Example #1
        public static void Run()
        {
            NCrawlerModule.Setup();
            Console.Out.WriteLine("Simple crawl demo");

            // Setup crawler to crawl http://ncrawler.codeplex.com
            // with 10 threads, adhering to robot rules, and a maximum depth
            // of 10, using 5 pipeline steps:
            //  * Step 1 - The HTML processor: parses html and extracts links, text and more
            //  * Step 2 - Processes PDF files, extracting text
            //  * Step 3 - Attempts to determine the page language from the extracted text, using Google language detection
            //  * Step 4 - Processes MP3 files
            //  * Step 5 - Dumps the information to the console; this is a custom step, see the DumperStep class
            using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
                new HtmlDocumentProcessor(), // Process html
                new iTextSharpPdfProcessor.iTextSharpPdfProcessor(), // Add PDF text extraction
                new GoogleLanguageDetection(), // Add language detection
                new Mp3FileProcessor(), // Process MP3 files
                new DumperStep()) // Custom step to visualize the crawl
                {
                    MaximumThreadCount = 10,
                    MaximumCrawlDepth = 10,
                    ExcludeFilter = Program.ExtensionsToSkip,
                })
            {
                // Begin crawl
                c.Crawl();
            }
        }
Example #2
 private void button1_Click(object sender, EventArgs e)
 {
     Crawler = new Crawler(textBox1.Text);
     bindingSource1.DataSource = Crawler;
     Crawler.Crawl(textBox1.Text);
     bindingSource1.ResetBindings(false);
 }
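
The handler above runs the crawl synchronously on the UI thread, so the form freezes until the crawl finishes. A minimal non-blocking sketch, assuming the same Crawler type with a synchronous Crawl(string) method, pushes the work to the thread pool:

 private async void button1_Click(object sender, EventArgs e)
 {
     Crawler = new Crawler(textBox1.Text);
     bindingSource1.DataSource = Crawler;

     // Run the blocking crawl on a thread-pool thread so the UI stays responsive
     await Task.Run(() => Crawler.Crawl(textBox1.Text));

     // Execution resumes on the UI thread after the await, so it is safe to touch bindings here
     bindingSource1.ResetBindings(false);
 }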
Example #3
        private void btnCrawl_Click(object sender, EventArgs e)
        {
            // Bail out if the user cancelled the folder dialog
            if (folderBrowserDialog1.ShowDialog() != DialogResult.OK)
            {
                return;
            }

            string startingPath = folderBrowserDialog1.SelectedPath;

            try
            {
                if (startingPath != String.Empty)
                {
                    crawler.Crawl(startingPath);
                }

                var files = crawler.GetFiles();

                // Check for null before reading Count
                if (files != null)
                {
                    textBoxMessages.Text = $"Found {files.Count} music files.";
                    textBoxMessages.Text = "Tagging all the files. Hang on.";

                    var resultFiles = new Dictionary<string, File>();

                    tagger = new Tagger(resultFiles);
                    tagger.RunTagJob(files, (list) =>
                    {
                        SetGridDataDelegate d = new SetGridDataDelegate(SetGridData);
                        this.Invoke(d, new object[] { list });
                    });
                    buttonSaveAsPlaylist.Visible = true;
                }
            }
            catch (Exception ex)
            {
                ErrorLogger.LogError(ex);
            }
        }
Example #4
        public static void Run(frmMain parentForm, Book book)
        {
            form = parentForm;
            IsolatedStorageModule.Setup(false);

            currentBook = book;

            existingReviewIds = CrawlUtil.getNewContext().Reviews.Where(r => r.bookId == currentBook.id).Select(r => r.id).ToList();

            baseUri = "http://www.goodreads.com/api/reviews_widget_iframe?did=DEVELOPER_ID&amp;format=html&amp;isbn=" + book.isbn + "&amp;links=660&amp;min_rating=&amp;review_back=fff&amp;stars=000&amp;text=000";

            c = new Crawler(new Uri(baseUri),
                            new HtmlDocumentProcessor(), // Process html
                            new ReviewIFrameDumperStep());

            // Configure crawl limits
            c.MaximumThreadCount = 1; //** 2012-09-03 changed this from 2 to 1 in hopes that it'll fix the unknown (seemingly) random crashes.
            c.MaximumCrawlDepth  = 1;
            c.ExcludeFilter      = CrawlUtil.ExtensionsToSkip;

            c.AdhereToRobotRules = false;

            // Begin crawl
            c.Crawl();
        }
Example #5
        public static void Run(frmMain parentForm, User user)
        {
            form    = parentForm;
            count   = 0;
            maxPage = 1;

            //use in-memory storage

            baseUri = string.Format("http://www.goodreads.com/user/{0}/favorite_authors", user.id);

            Crawler c = new Crawler(new Uri(baseUri),
                                    new HtmlDocumentProcessor(), // Process html
                                    new CrawlFavouriteAuthors_DumperStep(user));

            // Configure crawl limits
            c.MaximumThreadCount = 1;
            c.MaximumCrawlDepth  = 1;
            c.ExcludeFilter      = CrawlUtil.ExtensionsAndPagesToSkip;

            c.BeforeDownload += new EventHandler <NCrawler.Events.BeforeDownloadEventArgs>(c_BeforeDownload);

            c.AdhereToRobotRules = false;

            // Begin crawl
            c.Crawl();
        }
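
The c_BeforeDownload handler wired up above is defined elsewhere in the class. A minimal sketch of a handler with the matching signature, used here purely as a politeness throttle (the 500 ms delay is an illustrative assumption, not taken from the original code):

        static void c_BeforeDownload(object sender, NCrawler.Events.BeforeDownloadEventArgs e)
        {
            // Crude politeness delay between downloads; tune or remove as needed
            System.Threading.Thread.Sleep(500);
        }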
Example #6
        public static void Run(SavePageForm parentForm, string url)
        {
            Form = parentForm;
            Url  = url;

            c = new Crawler(new Uri(url),
                            new HtmlDocumentProcessor(), // Process html
                            new SaveFileStep());

            c.MaximumThreadCount = 1;
            c.MaximumCrawlDepth  = 1;
            c.ExcludeFilter      = CrawlUtil.ExtensionsToSkip;

            c.AdhereToRobotRules = false;
            c.CrawlFinished     += new EventHandler <NCrawler.Events.CrawlFinishedEventArgs>(c_CrawlFinished);

            string ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);

            //if there are no unblocked user agents left then reset the tracker and retry
            if (ua == null)
            {
                UserAgentTracker = CrawlUtil.InitUserAgentTracker();
                ua = CrawlUtil.GetRandomUnblockedUserAgent(UserAgentTracker);
            }
            c.UserAgent = ua;

            // Begin crawl
            c.Crawl();
        }
Example #7
		public static void Run()
		{
			NCrawlerModule.Setup();

			// Register new implementation for ICrawlerRules using our custom class CustomCrawlerRules defined below
			NCrawlerModule.Register(builder =>
				builder.Register((c, p) =>
					{
						NCrawlerModule.Setup(); // Return to standard setup
						return new CustomCrawlerRules(p.TypedAs<Crawler>(), c.Resolve<IRobot>(p), p.TypedAs<Uri>(),
							p.TypedAs<ICrawlerHistory>());
					}).
				As<ICrawlerRules>().
				InstancePerDependency());

			Console.Out.WriteLine("Advanced crawl demo");

			using (Crawler c = new Crawler(
				new Uri("http://ncrawler.codeplex.com"),
				new HtmlDocumentProcessor(), // Process html
				new DumperStep())
				{
					MaximumThreadCount = 2,
					MaximumCrawlDepth = 2,
					ExcludeFilter = Program.ExtensionsToSkip,
				})
			{
				// Begin crawl
				c.Crawl();
			}
		}
Example #8
            void SetSnapshot(DataRenderer dataRenderer, PackedMemorySnapshot snapshot)
            {
                if (snapshot == null)
                {
                    m_RawSnapshot   = null;
                    m_RawSchema     = null;
                    SchemaToDisplay = null;
                    UpdateTableSelectionNames();
                    return;
                }

                m_RawSnapshot = snapshot;

                ProgressBarDisplay.ShowBar(string.Format("Opening snapshot: {0}", System.IO.Path.GetFileNameWithoutExtension(snapshot.filePath)));

                var cachedSnapshot = new CachedSnapshot(snapshot);

                using (Profiling.GetMarker(Profiling.MarkerId.CrawlManagedData).Auto())
                {
                    var crawling = Crawler.Crawl(cachedSnapshot);
                    crawling.MoveNext(); //start execution

                    var   status          = crawling.Current as EnumerationStatus;
                    float progressPerStep = 1.0f / status.StepCount;
                    while (crawling.MoveNext())
                    {
                        ProgressBarDisplay.UpdateProgress(status.CurrentStep * progressPerStep, status.StepStatus);
                    }
                }
                ProgressBarDisplay.ClearBar();

                m_RawSchema = new RawSchema();
                m_RawSchema.SetupSchema(cachedSnapshot, dataRenderer);

                SchemaToDisplay = m_RawSchema;
                if (k_DefaultViewFilePath.Length > 0)
                {
                    using (ScopeDebugContext.Func(() => { return("File '" + k_DefaultViewFilePath + "'"); }))
                    {
                        Database.View.ViewSchema.Builder builder = null;
                        using (Profiling.GetMarker(Profiling.MarkerId.LoadViewDefinitionFile).Auto())
                        {
                            builder = Database.View.ViewSchema.Builder.LoadFromXMLFile(k_DefaultViewFilePath);
                        }
                        if (builder != null)
                        {
                            using (Profiling.GetMarker(Profiling.MarkerId.BuildViewDefinitionFile).Auto())
                            {
                                ViewSchema = builder.Build(m_RawSchema);
                            }
                            if (ViewSchema != null)
                            {
                                SchemaToDisplay = ViewSchema;
                            }
                        }
                    }
                }

                UpdateTableSelectionNames();
            }
Example #9
        public override bool IsExternalUrl(Uri uri)
        {
            // Is External Url
            if (base.IsExternalUrl(uri))
            {
                // Yes, check if we have crawled it before
                if (!m_CrawlerHistory.Register(uri.GetUrlKeyString(UriSensitivity)))
                {
                    // Create child crawler to traverse external site with max 2 levels
                    using (Crawler externalCrawler = new Crawler(uri,
                                                                 new HtmlDocumentProcessor(), // Process html
                                                                 new DumperStep())
                    {
                        MaximumThreadCount = 1,
                        MaximumCrawlDepth = 2,
                        MaximumCrawlCount = 10,
                        ExcludeFilter = Program.ExtensionsToSkip,
                    })
                    {
                        // Crawl external site
                        externalCrawler.Crawl();
                    }
                }

                // Do not follow link on this crawler
                return(true);
            }

            return(false);
        }
Example #10
        public static void Run(frmMain parentForm, User user)
        {
            form = parentForm;

            //use in-memory storage

            baseUri = "http://www.goodreads.com/user/show/" + user.userIdString;

            //http://www.goodreads.com/user/show/104320-erin-beck
            //http://www.goodreads.com/author/show/3360351.Ryan_Dilbert


            Crawler c = new Crawler(new Uri(baseUri),
                                    new HtmlDocumentProcessor(), // Process html
                                    new UserProfileDumperStep(user));

            // Configure crawl limits
            c.MaximumThreadCount = 1;
            c.MaximumCrawlDepth  = 1;
            c.ExcludeFilter      = CrawlUtil.ExtensionsAndPagesToSkip;

            c.BeforeDownload += new EventHandler <NCrawler.Events.BeforeDownloadEventArgs>(c_BeforeDownload);

            c.AdhereToRobotRules = false;

            // Begin crawl
            c.Crawl();
        }
Example #11
        static void Main(string[] args)
        {
            var crawler = new Crawler("./Sample");
            var dir     = crawler.Crawl(null, "./", "./");

            Console.WriteLine(JsonConvert.SerializeObject(dir, Formatting.Indented));
        }
Example #12
 public ActionResult Crawl(int mappingCode)
 {
     var mapping = _db.Mappings.Include(m => m.Urls).Include(m => m.Properties).SingleOrDefault(m => m.Id == mappingCode);
     List<RecordValueViewModel> result = Crawler.Crawl(Factory.Convert(mapping));
     return Json(result, JsonRequestBehavior.AllowGet);
     //return PartialView(result);
 }
Example #13
        public override bool IsExternalUrl(Uri uri)
        {
            // Is External Url
            if (!base.IsExternalUrl(uri))
            {
                return false;
            }

            // Yes, check if we have crawled it before
            if (!m_CrawlerHistory.Register(uri.GetUrlKeyString(m_Crawler.UriSensitivity)))
            {
                return true;
            }

            // Create child crawler to traverse external site with max 2 levels
            using (Crawler externalCrawler = new Crawler(uri,
                new HtmlDocumentProcessor(), // Process html
                new DumperStep())
                {
                    MaximumThreadCount = 1,
                    MaximumCrawlDepth = 2,
                    MaximumCrawlCount = 10,
                    ExcludeFilter = Program.ExtensionsToSkip,
                })
            {
                // Crawl external site
                externalCrawler.Crawl();
            }

            // Do not follow link on this crawler
            return true;
        }
Example #14
        public static void Run(frmMain parentForm, User user)
        {
            form  = parentForm;
            count = 0;

            //use in-memory storage

            baseUri = "http://www.goodreads.com/list/user_votes/" + user.userIdString;

            Crawler c = new Crawler(new Uri(baseUri),
                                    new HtmlDocumentProcessor(), // Process html
                                    new CrawlListAndVotes_DumperStep(user));

            // Configure crawl limits
            c.MaximumThreadCount = 1;
            c.MaximumCrawlDepth  = 1;
            c.ExcludeFilter      = CrawlUtil.ExtensionsAndPagesToSkip;

            c.BeforeDownload += new EventHandler <NCrawler.Events.BeforeDownloadEventArgs>(c_BeforeDownload);

            c.AdhereToRobotRules = false;

            // Begin crawl
            c.Crawl();
        }
Example #15
        static void Main(string[] args)
        {
            string  startUrl  = "https://www.cnblogs.com/Xy--1/";
            Crawler myCrawler = new Crawler(startUrl);

            myCrawler.Crawl();
        }
Example #16
        static void Main(string[] args)
        {
            var helpArgs = new string[] { "help", "-help", "/help" };

            if (args.Any(x => helpArgs.Contains(x.ToLower())))
            {
                ShowUsage();
                return;
            }

            ICrawler crawler = new Crawler();

            Console.WriteLine(
                $"AssemblyName\tType\tPath" +
                $"\tIsPackage\tPackageId\tRepositoryUrl" +
                $"\tDescription\tTargetFrameworks");

            foreach (ProjInfo pi in crawler.Crawl(Environment.CurrentDirectory))
            {
                string relativePath = PathHelper.GetRelativePath(Environment.CurrentDirectory, pi.CsProjPath);
                Console.WriteLine(
                    $"{pi.AssName}\t{pi.AssType}\t{relativePath}" +
                    $"\t{pi.GeneratePackage}\t{pi.PackageId}\t{pi.RepositoryUrl}" +
                    $"\t{pi.Description}\t{pi.TargetFrameworks}");
            }
        }
Example #17
        public static void Run()
        {
            IsolatedStorageModule.Setup(false);
            Console.Out.WriteLine("Simple crawl demo using IsolatedStorage");

            // Setup crawler to crawl http://ncrawler.codeplex.com
            // with 2 threads, adhering to robot rules, and a maximum depth
            // of 10, using 4 pipeline steps:
            //  * Step 1 - The HTML processor: parses html and extracts links, text and more
            //  * Step 2 - Processes PDF files, extracting text
            //  * Step 3 - Attempts to determine the page language from the extracted text, using Google language detection
            //  * Step 4 - Dumps the information to the console; this is a custom step, see the DumperStep class
            using (var c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
                                       new HtmlDocumentProcessor(),                         // Process html
                                       new iTextSharpPdfProcessor.iTextSharpPdfProcessor(), // Add PDF text extraction
                                       new GoogleLanguageDetection(),                       // Add language detection
                                       new DumperStep()) // Custom step to visualize the crawl
            {
                MaximumThreadCount = 2,
                MaximumCrawlDepth = 10,
                ExcludeFilter = Program.ExtensionsToSkip,
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
Example #18
        public static void Run()
        {
            NCrawlerModule.Setup();
            Console.Out.WriteLine("\nSimple indexer demo");

            // Setup crawler to crawl/index http://ncrawler.codeplex.com
            //  * Step 1 - The Html Processor, parses and extracts links, text and more from html
            //  * Step 2 - Custom step, that is supposed to send content to an Index or Database
            using (var c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
                                       new HtmlDocumentProcessor( // Process html, filter links and content
                                           // Set up a filter that removes all the text between <body and </body>.
                                           // These can be custom tags like <!--BeginTextFilter--> and <!--EndTextFilter-->
                                           // or whatever you prefer. This way you can control what text is extracted from every page.
                                           // In most cases you just want to filter out the header information or menu text.
                                           new Dictionary <string, string>
            {
                { "<body", "</body>" }
            },
                                           // Set up a filter that tells the crawler not to follow links between tags
                                           // that start with <head and end with </head>. These can be custom tags like
                                           // <!--BeginNoFollow--> and <!--EndNoFollow--> or whatever you prefer.
                                           // This way you can control which links the crawler should not follow.
                                           new Dictionary <string, string>
            {
                { "<head", "</head>" }
            }),
                                       new IndexerDemo())
            {
                MaximumThreadCount = 2
            })                     // Custom Step to send filtered content to index
            {
                // Begin crawl
                c.Crawl();
            }
        }
Example #19
        public void MaximumCrawlTime()
        {
            TestModule.SetupInMemoryStorage();

            // Setup
            Stopwatch timer;

            using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"), new HtmlDocumentProcessor())
            {
                // Configure crawl limits
                MaximumThreadCount = 10,
                MaximumCrawlDepth = 10,
                MaximumCrawlTime = TimeSpan.FromSeconds(2)
            })
            {
                timer = Stopwatch.StartNew();

                // Run
                c.Crawl();
                timer.Stop();
            }

            // Allow time for a graceful finish
            Assert.Less(timer.ElapsedMilliseconds, 10000);
        }
Example #20
        public static void Main(string[] args)
        {
            Crawler crawler = new Crawler(new Uri("http://allrecipes.com/"));

            crawler.LoadRobotsTxt().Wait();
            crawler.Crawl().Wait();
        }
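
Calling Wait() blocks the main thread and wraps any failure in an AggregateException. Since C# 7.1 an async Main keeps the same flow without blocking; a sketch assuming the same Task-returning LoadRobotsTxt() and Crawl() methods:

        public static async Task Main(string[] args)
        {
            Crawler crawler = new Crawler(new Uri("http://allrecipes.com/"));

            // await instead of .Wait(): no blocked thread, and exceptions
            // surface directly rather than wrapped in an AggregateException
            await crawler.LoadRobotsTxt();
            await crawler.Crawl();
        }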
Example #21
        public static void CrawlWith(
            [Required] string address,
            [DefaultValue(true)] bool verbose,
            [DefaultValue(true)] bool includeImages,
            [DefaultValue(true)] bool includeLinks,
            [DefaultValue(true)] bool includeScripts,
            [DefaultValue(true)] bool includeStyles,
            [DefaultValue(true)] bool includeFailureCheck,
            [DefaultValue(true)] bool includeRobots,
            [DefaultValue(100)] int maxDepth,
            [DefaultValue(0)] int delay,
            [DefaultValue("")] string searchExpression,
            [DefaultValue("")] string partnerSites)
        {
            var config = GetComplexConfig(address, verbose, includeImages, includeLinks, includeScripts, includeStyles, includeFailureCheck, includeRobots, maxDepth, delay, searchExpression, partnerSites);

            Console.WriteLine(JsonConvert.SerializeObject(config));

            Crawler.Crawl(config);

            if (config.Listener.GetCrawlResult().ErrorCount > 0)
            {
                Environment.Exit((int)ExitCode.CrawlError);
            }
        }
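
The ExitCode enum cast to an exit status above is not part of the snippet; a minimal hypothetical definition consistent with that cast might be:

        // Hypothetical exit codes; only CrawlError is actually referenced in the example
        private enum ExitCode
        {
            Success = 0,
            CrawlError = 1
        }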
Example #22
 public void TestMe()
 {
     // Intentionally disabled: everything below the throw is unreachable
     throw new NotImplementedException();
     var crawler = new Crawler();
     crawler.Crawl("http://google.com");
     Thread.Sleep(3000);
 }
Example #23
		public static void Run()
		{
			NCrawlerModule.Setup();
			Console.Out.WriteLine("\nSimple indexer demo");

			// Setup crawler to crawl/index http://ncrawler.codeplex.com
			// 	* Step 1 - The Html Processor, parses and extracts links, text and more from html
			//  * Step 2 - Custom step, that is supposed to send content to an Index or Database
			using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
				new HtmlDocumentProcessor( // Process html, filter links and content
				// Set up a filter that removes all the text between <body and </body>.
				// These can be custom tags like <!--BeginTextFilter--> and <!--EndTextFilter-->
				// or whatever you prefer. This way you can control what text is extracted from every page.
				// In most cases you just want to filter out the header information or menu text.
					new Dictionary<string, string>
						{
							{"<body", "</body>"}
						},
				// Set up a filter that tells the crawler not to follow links between tags
				// that start with <head and end with </head>. These can be custom tags like
				// <!--BeginNoFollow--> and <!--EndNoFollow--> or whatever you prefer.
				// This way you can control which links the crawler should not follow.
					new Dictionary<string, string>
						{
							{"<head", "</head>"}
						}),
				new IndexerDemo())
				{
					MaximumThreadCount = 2
				}) // Custom Step to send filtered content to index
			{
				// Begin crawl
				c.Crawl();
			}
		}
Example #24
        public static void Run()
        {
            NCrawlerModule.Setup();

            // Register new implementation for ICrawlerRules using our custom class CustomCrawlerRules defined below
            NCrawlerModule.Register(builder =>
                                    builder.Register((c, p) =>
            {
                NCrawlerModule.Setup();                                 // Return to standard setup
                return(new CustomCrawlerRules(p.TypedAs <Crawler>(), c.Resolve <IRobot>(p), p.TypedAs <Uri>(),
                                              p.TypedAs <ICrawlerHistory>()));
            }).
                                    As <ICrawlerRules>().
                                    InstancePerDependency());

            Console.Out.WriteLine("Advanced crawl demo");

            using (var c = new Crawler(
                       new Uri("http://ncrawler.codeplex.com"),
                       new HtmlDocumentProcessor(),          // Process html
                       new DumperStep())
            {
                MaximumThreadCount = 2,
                MaximumCrawlDepth = 2,
                ExcludeFilter = Program.ExtensionsToSkip,
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
Example #25
        public static void Main(string[] args)
        {
            try
            {
                // Validate the argument count before indexing into args
                if (args.Length < 4)
                {
                    throw new ArgumentException("Invalid number of arguments!");
                }

                string sequenceType = args[0];
                string start = args[1];
                long max = long.Parse(args[2]);
                int pause = int.Parse(args[3]); // Pause interval

                CrawlContext context = GetCrawlContext(sequenceType, start, max, pause);
                Crawler crawler = CreateCrawler(context);

                crawler.Crawl(max, context.QueryType);
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
                Trace.TraceError(e.ToString());
            }
        }
Example #26
        public void TestMe()
        {
            // Intentionally disabled: everything below the throw is unreachable
            throw new NotImplementedException();
            var crawler = new Crawler();

            crawler.Crawl("http://google.com");
            Thread.Sleep(3000);
        }
Example #27
        static void Main(string[] args)
        {
            Crawler crawler = new Crawler();
            IObservable<Uri> observable1 = crawler.Crawl(new Uri("http://www.codinghorror.com/"));

            observable1.Subscribe(onNext: Console.WriteLine, onCompleted: () => Console.WriteLine("Crawling completed"));

            Console.ReadLine();
        }
Example #28
        static void Main(string[] args)
        {
            Crawler           crawler     = new Crawler();
            IObservable <Uri> observable1 = crawler.Crawl(new Uri("http://www.codinghorror.com/"));

            observable1.Subscribe(onNext: Console.WriteLine, onCompleted: () => Console.WriteLine("Crawling completed"));

            Console.ReadLine();
        }
Example #29
        private async void button1_Click(object sender, EventArgs e)
        {
            LogHelper log = new LogHelper(typeof(Form1));

            using (Crawler spider = new Crawler())
            {
                string html = await spider.Crawl(ConstVar.AreaUrl, Encoding.UTF8);
            }
        }
Example #30
        public void Run()
        {
            Console.WriteLine(typeof(DemoOne).Name);
            // Uses the crawl framework; exceptions can be caught via the exception event
            string  url = "https://www.baidu.com/";
            Crawler c   = new Crawler(url);

            // This first crawl produces no output; configuration starts below
            Console.WriteLine("No pipeline configured");
            c.Crawl();
            Console.WriteLine("First request finished\r");

            // Add a pipeline handler for the second request
            c.AddPipeline(new DemoOneDeal());
            c.Crawl();
            Console.WriteLine("Second request finished\r");
            Console.WriteLine();
        }
Example #31
        static void Main(string[] args)
        {
            Crawler           crawler    = new Crawler();
            IObservable <Uri> observable = crawler.Crawl(new Uri("https://dotnet.microsoft.com"));

            observable.Subscribe(onNext: Console.WriteLine, onCompleted: () => Console.WriteLine("Crawling completed"));

            Console.ReadLine();
        }
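
Because Crawl returns an IObservable<Uri>, standard Rx operators compose directly on the stream before subscribing. A small sketch, assuming the System.Reactive package for the Where operator, that only reports URIs on the start host:

        static void Main(string[] args)
        {
            Crawler crawler = new Crawler();
            IObservable<Uri> observable = crawler.Crawl(new Uri("https://dotnet.microsoft.com"));

            // Filter the stream before subscribing; other Rx operators chain the same way
            observable.Where(u => u.Host == "dotnet.microsoft.com")
                      .Subscribe(onNext: Console.WriteLine,
                                 onCompleted: () => Console.WriteLine("Crawling completed"));

            Console.ReadLine();
        }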
Example #32
        public void StartCrawlers(Crawler mycrawler)
        {
            Thread thread_1 = new Thread(() => mycrawler.Crawl());

            thread_1.Name = "Crawler thread 1";
            Thread thread_2 = new Thread(() => mycrawler.Crawl());

            thread_2.Name = "Crawler thread 2";
            Thread thread_3 = new Thread(() => mycrawler.Crawl());

            thread_3.Name = "Crawler thread 3";

            // Note: Task.WaitAll only waits for the Start() calls to return,
            // not for the crawl threads themselves to finish
            Task[] tasks =
            { Task.Run(() => thread_1.Start()),     Task.Run(() => thread_2.Start()),
              Task.Run(() => thread_3.Start()), };

            Task.WaitAll(tasks);
        }
Example #33
        static void Main(string[] args)
        {
            Uri     uri = new Uri("http://www.csdn.net/");
            Crawler c   = new Crawler(uri, new HtmlDocumentProcessor(), new DumperStep());

            c.MaximumThreadCount = 30; // Thread count
            c.MaximumCrawlDepth  = 2;  // Crawl depth
            c.Crawl();                 // Start crawling
        }
Example #34
        public async Task <ActionResult> Deploy()
        {
            var c = new Crawler();
            await c.Crawl(new Uri(Request.Url.GetLeftPart(UriPartial.Authority)), CloudConfigurationManager.GetSetting("BasicPassword"), "");

            return(new ContentResult {
                Content = string.Join("<br>", c.CrawledUrls) + string.Join("<br>", c.log.entires)
            });
        }
Example #35
        private Crawler TestCrawlerMethod(string path, int expectedCount, int recursionLimit)
        {
            List <string> uriList = new List <String>();

            uriList.Add(path);

            Crawler crawler = new Crawler(uriList, new Browser(), recursionLimit);

            crawler.Crawl();

            foreach (HttpRequestResult result in crawler.HttpRequestResults)
            {
                try
                {
                    if (result.Error != null)
                    {
                        Console.WriteLine("The error property indicated a {1}, at {0} with the message, \"{2}\"", result.Error.AbsoluteUri.ToString() ?? "null", result.Error.HttpCode.ToString() ?? "null", result.Error.Message.ToString() ?? "null");
                    }
                    else if (result.ContentType != null && result.IsHtml && result.Content != null)
                    {
                        Console.WriteLine("Content for requestUrl, {0}, is as follows:\n{1}", result.RequestUrl, result.Content);
                    }
                    else if (result.ContentType == null)
                    {
                        Console.WriteLine("ContentType for requestUrl, {0}, is null.", result.RequestUrl);
                    }
                    else if (!result.IsHtml)
                    {
                        Console.WriteLine("ContentType for requestUrl, {0}, is not html.", result.RequestUrl);
                    }
                    else if (result.Content == null)
                    {
                        Console.WriteLine("Content for requestUrl, {0}, is null.", result.RequestUrl);
                    }
                    else
                    {
                        Console.WriteLine("Problem writing result information to console.");
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine("The following exception occurred while attempting to write information about the reuslt.");
                    Console.WriteLine(ex);
                }
            }

            Assert.AreEqual(expectedCount, crawler.HttpRequestResults.Count);

            AssertLinksFromRemoteSiteNotRetrieved(crawler);
            AssertLinksNullStateForCssAndHtmlTypes(crawler);
            AssertBadLinksHaveNullAbsoluteUriAndPopulatedEx(crawler);

            return(crawler);
        }
Example #36
        public void TheRootAddressShouldBeCrawled()
        {
            var config = new CrawlerConfig
            {
                RootAddress  = new Uri("http://localhost:51746/"),
                Listener     = this,
                MaxDepth     = 1,
                CrawlerFlags = CrawlerFlags.IncludeLinks | CrawlerFlags.IncludeFailureCheck
            };

            Crawler.Crawl(config);
        }
Example #37
 //This is a basic example of how to use the crawler
 //In case of a cache miss, it prints out the page's title and
 //absolute URI, and saves the page data to the filesystem.
 public static void Main(String[] args)
 {
     if ((args.Length == 2 || args.Length == 3) &&
         Uri.IsWellFormedUriString(args[0], UriKind.Absolute))
     {
         Uri startingUri = new Uri(args[0]);
         String targetDirectoryPath = args[1];
         bool followExternal =
             args.Length == 3 && args[2] == "--follow-external";
         Console.WriteLine("Loading from cache...");
         Cache cache = new Cache(startingUri, targetDirectoryPath);
         Console.WriteLine(
             "Cache loaded - {0} pages stored in cache", cache.Count());
         Crawler crawler = new Crawler(cache, followExternal);
         Persister persister = new Persister(targetDirectoryPath, startingUri);
         //This event is fired when the crawler's process is over
         crawler.WorkComplete += () =>
         {
             Environment.Exit(0);
         };
         //This event is fired every time a valid page is downloaded
         crawler.NewPageFetched += (page) =>
         {
             Console.WriteLine(page.Title + " - " + page.Uri.AbsoluteUri);
             persister.Save(page);
         };
         //starts the crawler, on a different thread
         crawler.Crawl(startingUri);
         Console.WriteLine("Crawler started, press CTRL+C to interrupt");
         // Sleep instead of busy-waiting so the loop does not peg a CPU core
         while (true) { Thread.Sleep(1000); }
     }
     else
     {
         Console.WriteLine("Crawler");
         Console.WriteLine("Usage:");
         Console.WriteLine(
             "Tenteikura.Example.exe <starting_uri> <target_directory> [--options]");
         Console.WriteLine(
             "<starting_uri> : a valid absolute URL which will be the starting point for the crawler");
         Console.WriteLine(
             "<target_directory> : the directory where the page files will be saved");
         Console.WriteLine("");
         Console.WriteLine("OPTIONS:");
         Console.WriteLine(
             "The only option available is --follow-external, which will make the crawler fetch non-local urls as well");
         Console.WriteLine("EXAMPLE: ");
         Console.WriteLine(
             @"Tenteikura.Example.exe http://telenor.com C:\mytargetdirectory --follow-external");
     }
 }
Example #38
        static void Main(string[] args)
        {
            ThreadPool.SetMinThreads(200, 200);
            var crawler = new Crawler();
            var fileStore = new FileStore(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Cache"));
            var cachingHandler = new CachingHandler(fileStore)
                {
                    InnerHandler = new HttpClientHandler()
                };

            crawler.Requester = new HttpClient(cachingHandler);
            if (ConfigurationManager.ConnectionStrings["UrlStore"] != null)
                crawler.Store = new UrlStore();
            crawler.Crawl("http://YAHOO.COM");
            Console.Read();
        }
Example #39
		public static void Run()
		{
			NCrawlerModule.Setup();
			// Demo 2 - Find broken links
			Console.Out.WriteLine("\nFind broken links demo");

			// Setup crawler to crawl http://ncrawler.codeplex.com
			// with 5 threads, adhering to robot rules, and a maximum depth
			// of 2, using 2 pipeline steps
			using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
				new HtmlDocumentProcessor(), // Process html
				new DumpBrokenLinksStep()) // Custom pipeline Step
				{
					MaximumThreadCount = 5,
					MaximumCrawlDepth = 2,
				})
			{
				// Begin crawl
				c.Crawl();
			}
		}
Example #40
        public void MaximumCrawlTime()
        {
            TestModule.SetupInMemoryStorage();

            // Setup
            Stopwatch timer;
            using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"), new HtmlDocumentProcessor())
                {
                    // Configure crawl limits
                    MaximumThreadCount = 10,
                    MaximumCrawlDepth = 10,
                    MaximumCrawlTime = TimeSpan.FromSeconds(2)
                })
            {
                timer = Stopwatch.StartNew();

                // Run
                c.Crawl();
                timer.Stop();
            }

            // Allow time for a graceful finish
            Assert.Less(timer.ElapsedMilliseconds, 10000);
        }
Example #41
 private static CollectorStep CollectionCrawl()
 {
     CollectorStep collectorStep = new CollectorStep();
     HtmlDocumentProcessor htmlDocumentProcessor = new HtmlDocumentProcessor();
     using (Crawler crawler = new Crawler(new Uri("http://ncrawler.codeplex.com"), collectorStep, htmlDocumentProcessor))
     {
         Console.Out.WriteLine(crawler.GetType());
         crawler.MaximumThreadCount = 5;
         crawler.UriSensitivity = UriComponents.HttpRequestUrl;
         crawler.ExcludeFilter = new[]
             {
                 new RegexFilter(
                     new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png)",
                         RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase))
             };
         crawler.Crawl();
         return collectorStep;
     }
 }