Example #1
        public ActionResult Create(Crawler.Entity.Task task)
        {
            if (ModelState.IsValid)
            {
                task.ID = Guid.NewGuid();
                //db.Tasks.Add(task);
                //db.SaveChanges();
                taskService.Create(task);

                return RedirectToAction("Index");
            }

            ViewBag.CategoryId = new SelectList(categoryService.Table.ToList(), "ID", "Name", task.CategoryId);//db.TaskCategories
            return View(task);

            //try
            //{
            //    // TODO: Add insert logic here

            //    return RedirectToAction("Index");
            //}
            //catch
            //{
            //    return View();
            //}
        }
		public override void Process(Crawler crawler, PropertyBag propertyBag)
		{
			AspectF.Define.
				NotNull(crawler, "crawler").
				NotNull(propertyBag, "propertyBag");

			if (propertyBag.StatusCode != HttpStatusCode.OK)
			{
				return;
			}

			if (!IsHtmlContent(propertyBag.ContentType))
			{
				return;
			}

			string documentDomHtml = string.Empty;
			Thread tempThread = new Thread(o =>
				{
					using (TridentBrowserForm internetExplorer = new TridentBrowserForm(propertyBag.ResponseUri.ToString()))
					{
						Application.Run(internetExplorer);
						documentDomHtml = internetExplorer.DocumentDomHtml;
					}
				});
			tempThread.SetApartmentState(ApartmentState.STA);
			tempThread.Start();
			tempThread.Join();

			propertyBag.GetResponse = () => new MemoryStream(Encoding.UTF8.GetBytes(documentDomHtml));
			base.Process(crawler, propertyBag);
		}
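This override depends on an IsHtmlContent helper that the listing does not include. A minimal sketch, assuming the content type arrives as a raw header value such as "text/html; charset=utf-8":

		// Hypothetical helper: treat the response as HTML when the Content-Type
		// value starts with "text/html" (any charset parameter is ignored).
		private static bool IsHtmlContent(string contentType)
		{
			return !string.IsNullOrEmpty(contentType) &&
				contentType.Trim().StartsWith("text/html", StringComparison.OrdinalIgnoreCase);
		}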
Example #3
        public ActionResult Create(Crawler.Entity.TaskItem item, string createAddCaptureRule, string captureRuleForContent, string captureRuleForNavigation)
        {
            if (ModelState.IsValid)
            {
                item.ID = Guid.NewGuid();

                if (!string.IsNullOrEmpty(captureRuleForContent))
                {
                    Crawler.Entity.CaptureRule captureRuleCon = new Crawler.Entity.CaptureRule { ID = new Guid(captureRuleForContent) };
                    captureRuleService.Context.Set<Crawler.Entity.CaptureRule>().Attach(captureRuleCon);
                    item.CaptureRules.Add(captureRuleCon);
                }
                if (!string.IsNullOrEmpty(captureRuleForNavigation))
                {
                    CaptureRule captureRuleNav = new CaptureRule { ID = new Guid(captureRuleForNavigation) };
                    captureRuleService.Context.Set<CaptureRule>().Attach(captureRuleNav);
                    item.CaptureRules.Add(captureRuleNav);
                }

                taskItemService.Create(item);
                if (string.IsNullOrEmpty(createAddCaptureRule))
                {
                    return RedirectToAction("Index", new { taskid = item.TaskId });
                }
                else
                {
                    return RedirectToAction("Create", "CaptureRule", new { taskitemid = item.ID });
                }

            }

            ViewBag.TaskId = new SelectList(taskService.GetAll(), "ID", "Name", item.TaskId);
            ViewBag.PageCategory = new SelectList(DictionaryDataUtil.GetData("pageType"), "ID", "Name", 1);
            return View(item);
        }
Example #4
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            using (TempFile tempFile = new TempFile())
            {
                using (FileStream fs = new FileStream(tempFile.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 0x1000))
                using (Stream input = propertyBag.GetResponse())
                {
                    input.CopyToStream(fs);
                }

                UltraID3 id3 = new UltraID3();
                id3.Read(tempFile.FileName);

                propertyBag["MP3_Album"].Value = id3.Album;
                propertyBag["MP3_Artist"].Value = id3.Artist;
                propertyBag["MP3_Comments"].Value = id3.Comments;
                propertyBag["MP3_Duration"].Value = id3.Duration;
                propertyBag["MP3_Genre"].Value = id3.Genre;
                propertyBag["MP3_Title"].Value = id3.Title;
            }
        }
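Several of these steps rely on a TempFile helper that is not part of the listing. A minimal sketch, assuming it only reserves a temporary file name and deletes the file on dispose:

        // Hypothetical stand-in for the TempFile helper used above: reserves a
        // temporary file name and removes the file (if created) when disposed.
        public sealed class TempFile : IDisposable
        {
            public string FileName { get; set; }

            public TempFile()
            {
                FileName = Path.Combine(Path.GetTempPath(), Path.GetRandomFileName());
            }

            public void Dispose()
            {
                if (File.Exists(FileName))
                {
                    File.Delete(FileName);
                }
            }
        }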
		public static void Run()
		{
			NCrawlerModule.Setup();

			// Register new implementation for ICrawlerRules using our custom class CustomCrawlerRules defined below
			NCrawlerModule.Register(builder =>
				builder.Register((c, p) =>
					{
						NCrawlerModule.Setup(); // Return to standard setup
						return new CustomCrawlerRules(p.TypedAs<Crawler>(), c.Resolve<IRobot>(p), p.TypedAs<Uri>(),
							p.TypedAs<ICrawlerHistory>());
					}).
				As<ICrawlerRules>().
				InstancePerDependency());

			Console.Out.WriteLine("Advanced crawl demo");

			using (Crawler c = new Crawler(
				new Uri("http://ncrawler.codeplex.com"),
				new HtmlDocumentProcessor(), // Process html
				new DumperStep())
				{
					MaximumThreadCount = 2,
					MaximumCrawlDepth = 2,
					ExcludeFilter = Program.ExtensionsToSkip,
				})
			{
				// Begin crawl
				c.Crawl();
			}
		}
Example #6
        /// <summary>
        /// Creates a new spider collection instance.
        /// </summary>
        /// <param name="crawler">The crawler object.</param>
        public Spiders(Crawler crawler)
        {
            this.crawler = crawler;

            // Create the spiders.
            this.spiderStandardFeeds = new SpiderStandardFeeds(this.crawler);
        }
Example #7
		public static void Run()
		{
			NCrawlerModule.Setup();
			Console.Out.WriteLine("\nSimple indexer demo");

			// Setup crawler to crawl/index http://ncrawler.codeplex.com
			// 	* Step 1 - The Html Processor, parses and extracts links, text and more from html
			//  * Step 2 - Custom step, that is supposed to send content to an Index or Database
			using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
				new HtmlDocumentProcessor( // Process html, filter links and content
				// Setup filter that removes all the text between <body and </body>.
				// This can be custom tags like <!--BeginTextFilter--> and <!--EndTextFilter-->
				// or whatever you prefer. This way you can control what text is extracted on every page.
				// In most cases you just want to filter out the header information or menu text.
					new Dictionary<string, string>
						{
							{"<body", "</body>"}
						},
				// Setup filter that tells the crawler not to follow links between tags
				// that start with <head and end with </head>. This can be custom tags like
				// <!--BeginNoFollow--> and <!--EndNoFollow--> or whatever you prefer.
				// This way you can control which links the crawler should not follow.
					new Dictionary<string, string>
						{
							{"<head", "</head>"}
						}),
				new IndexerDemo())
				{
					MaximumThreadCount = 2
				}) // Custom Step to send filtered content to index
			{
				// Begin crawl
				c.Crawl();
			}
		}
		public void Process(Crawler crawler, PropertyBag propertyBag)
		{
			if (propertyBag.StatusCode != HttpStatusCode.OK)
			{
				return;
			}

			string extension = MapContentTypeToExtension(propertyBag.ContentType);
			if (extension.IsNullOrEmpty())
			{
				return;
			}

			propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;
			using (TempFile temp = new TempFile())
			{
				temp.FileName += "." + extension;
				using (FileStream fs = new FileStream(temp.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 0x1000))
				using (Stream input = propertyBag.GetResponse())
				{
					input.CopyToStream(fs);
				}

				using (FilterReader filterReader = new FilterReader(temp.FileName))
				{
					string content = filterReader.ReadToEnd();
					propertyBag.Text = content.Trim();
				}
			}
		}
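Both IFilter-based steps call a MapContentTypeToExtension method that is not shown. One plausible sketch, assuming a simple lookup from MIME type to the file extension the FilterReader needs:

		// Hypothetical mapping from response content type to a file extension,
		// so FilterReader can select the matching IFilter; unknown types return
		// an empty string and are skipped by the step.
		private static string MapContentTypeToExtension(string contentType)
		{
			switch ((contentType ?? string.Empty).Split(';')[0].Trim().ToLowerInvariant())
			{
				case "application/pdf":
					return "pdf";
				case "application/msword":
					return "doc";
				case "application/vnd.ms-excel":
					return "xls";
				default:
					return string.Empty;
			}
		}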
Example #9
		public void Process(Crawler crawler, PropertyBag propertyBag)
		{
			string textContent = propertyBag.Text; // Filtered text content

			// Here you can send downloaded filtered content to an index, database, filesystem or whatever
			Console.Out.WriteLine(textContent);
		}
 public void TestMe()
 {
     throw new NotImplementedException(); // Stub: the calls below are unreachable until this throw is removed
     var crawler = new Crawler();
     crawler.Crawl("http://google.com");
     Thread.Sleep(3000);
 }
Example #11
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            string extension = MapContentTypeToExtension(propertyBag.ContentType);
            if (extension.IsNullOrEmpty())
            {
                return;
            }

            propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;
            using (TempFile temp = new TempFile())
            {
                temp.FileName += "." + extension;
                File.WriteAllBytes(temp.FileName, propertyBag.Response);
                using (FilterReader filterReader = new FilterReader(temp.FileName))
                {
                    string content = filterReader.ReadToEnd();
                    propertyBag.Text = content.Trim();
                }
            }
        }
Example #12
        public static void Run()
        {
            NCrawlerModule.Setup();
            Console.Out.WriteLine("Simple crawl demo");

            // Setup crawler to crawl http://ncrawler.codeplex.com
            // with 10 threads adhering to robot rules, and a maximum depth
            // of 10, using 5 pipeline steps:
            //  * Step 1 - The Html Processor, parses and extracts links, text and more from html
            //  * Step 2 - Processes PDF files, extracting text
            //  * Step 3 - Tries to determine the page language from the extracted text, using Google language detection
            //  * Step 4 - Extracts ID3 tag information from MP3 files
            //  * Step 5 - Dumps the information to the console; this is a custom step, see the DumperStep class
            using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"),
                new HtmlDocumentProcessor(), // Process html
                new iTextSharpPdfProcessor.iTextSharpPdfProcessor(), // Add PDF text extraction
                new GoogleLanguageDetection(), // Add language detection
                new Mp3FileProcessor(), // Add MP3 tag extraction
                new DumperStep()) // Custom step to visualize the crawl
                {
                    MaximumThreadCount = 10,
                    MaximumCrawlDepth = 10,
                    ExcludeFilter = Program.ExtensionsToSkip,
                })
            {
                // Begin crawl
                c.Crawl();
            }
        }
Example #13
        public override bool IsExternalUrl(Uri uri)
        {
            // Is External Url
            if (!base.IsExternalUrl(uri))
            {
                return false;
            }

            // Yes, check if we have crawled it before
            if (!m_CrawlerHistory.Register(uri.GetUrlKeyString(m_Crawler.UriSensitivity)))
            {
                return true;
            }

            // Create child crawler to traverse external site with max 2 levels
            using (Crawler externalCrawler = new Crawler(uri,
                new HtmlDocumentProcessor(), // Process html
                new DumperStep())
                {
                    MaximumThreadCount = 1,
                    MaximumCrawlDepth = 2,
                    MaximumCrawlCount = 10,
                    ExcludeFilter = Program.ExtensionsToSkip,
                })
            {
                // Crawl external site
                externalCrawler.Crawl();
            }

            // Do not follow link on this crawler
            return true;
        }
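Judging from the registration lambda in the advanced crawl demo above, this IsExternalUrl override belongs to a CustomCrawlerRules class derived from CrawlerRulesService. A hedged sketch of that class, with field names taken from the override:

        // Sketch only: constructor parameters mirror the registration
        // new CustomCrawlerRules(crawler, robot, baseUri, crawlerHistory).
        public class CustomCrawlerRules : CrawlerRulesService
        {
            private readonly Crawler m_Crawler;
            private readonly ICrawlerHistory m_CrawlerHistory;

            public CustomCrawlerRules(Crawler crawler, IRobot robot, Uri baseUri, ICrawlerHistory crawlerHistory)
                : base(crawler, robot, baseUri)
            {
                m_Crawler = crawler;
                m_CrawlerHistory = crawlerHistory;
            }

            // public override bool IsExternalUrl(Uri uri) { ... } as shown above
        }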
		public override void Process(Crawler crawler, PropertyBag propertyBag)
		{
			AspectF.Define.
				NotNull(crawler, "crawler").
				NotNull(propertyBag, "propertyBag");

			if (propertyBag.StatusCode != HttpStatusCode.OK)
			{
				return;
			}

			if (!IsHtmlContent(propertyBag.ContentType))
			{
				return;
			}

			using (GeckoBrowserForm geckoBrowserForm = new GeckoBrowserForm(XulRunnerPath, propertyBag.ResponseUri.ToString()))
			{
				geckoBrowserForm.Show();
				while (!geckoBrowserForm.Done)
				{
					Application.DoEvents();
				}

				propertyBag.GetResponse = () => new MemoryStream(Encoding.UTF8.GetBytes(geckoBrowserForm.DocumentDomHtml));
				base.Process(crawler, propertyBag);
			}
		}
Example #15
        public void Test()
        {
            var seed = new Uri("http://nyqui.st");

            var cache = new Cache(seed);
            var crawler = new Crawler(cache);
            bool finished = false;

            crawler.OnCompleted += () =>
            {
                Console.WriteLine("[Main] Crawl completed!");
                finished = true;
            };

            crawler.OnPageDownloaded += (page) => { Console.WriteLine("[Main] Got page {0}", page.Url); };

            crawler.Start(seed);
            Console.WriteLine("[Main] Crawler started.");

            while (true)
            {
                if (finished)
                {
                    Assert.True(true);
                    break;
                }
            }
        }
Example #16
        /// <summary>
        /// Creates a new PlanetLab manager.
        /// </summary>
        /// <param name="crawler">The crawler.</param>
        public PlManager(Crawler crawler)
        {
            // Validate the arguments.
            if (null == crawler) throw new ArgumentNullException("crawler");

            // Set the crawler.
            this.crawler = crawler;
        }
Example #17
        /// <summary>
        /// Processes a downloaded page and stores matching bidvportal.vn documents in the database.
        /// </summary>
        /// <param name="crawler">
        /// The crawler.
        /// </param>
        /// <param name="propertyBag">
        /// The property bag.
        /// </param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
            string cultureDisplayValue = "N/A";
            if (!contentCulture.IsNull())
            {
                cultureDisplayValue = contentCulture.DisplayName;
            }

            TextExtraction t = new TextExtraction();

            lock (this)
            {
                Item item = new Item();
                item.Url = propertyBag.Step.Uri.ToString();

                if (item.Url.StartsWith("http://bidvportal.vn/eDocman"))
                {
                    item.Title = propertyBag.Title;

                    string strTarget = t.GetMinimumString(propertyBag.Text, "Chi tiết văn bản", "Nội dung văn bản");
                    item.Text = strTarget;

                    string strNgayPhatHanh = t.GetMinimumString(strTarget, "Ngày phát hành", "Số đi");
                    item.NgayPhatHanh = strNgayPhatHanh.Replace(' ','/');

                    string strSubject = t.GetMinimumString(strTarget, "Trích yếu", "Độ khẩn");
                    item.Subject = strSubject;

                    item.ContentEncoding = propertyBag.ContentEncoding;
                    item.ContentType = propertyBag.ContentType;
                    item.Length = propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length;
                    item.Depth = propertyBag.Step.Depth;
                    //item.CultureDisplayValue = cultureDisplayValue;

                    string[] strSplit = { "/" };
                    int day = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[0]);
                    int month = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[1]);
                    int year = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[2]);

                    if ((DateTime.Now.Year == year) && (DateTime.Now.Month == month) && (DateTime.Now.Day == day))
                    {
                        db.AddToItems(item);
                    }
                }
            }

            try
            {
                db.SaveChanges();
            }
            catch (Exception ex)
            {
                Console.WriteLine("=====================================================");
                Console.WriteLine(ex.Message);
            }
        }
		public void Process(Crawler crawler, PropertyBag propertyBag)
		{
			if (propertyBag.StatusCode != HttpStatusCode.OK)
			{
				Console.Out.WriteLine("Url '{0}' referenced from {1} returned with statuscode {2}",
					propertyBag.Step.Uri, propertyBag.OriginalReferrerUrl, propertyBag.StatusCode);
				Console.Out.WriteLine();
			}
		}
        public CrawlerSettings(Crawler c)
        {
            InitializeComponent();

            crawler = c;

            update_computer_file_textbox();
            update_data_folder_textbox();
        }
Example #20
        static void Main(string[] args)
        {
            Crawler crawler = new Crawler();
            IObservable<Uri> observable1 = crawler.Crawl(new Uri("http://www.codinghorror.com/"));

            observable1.Subscribe(onNext: Console.WriteLine, onCompleted: () => Console.WriteLine("Crawling completed"));

            Console.ReadLine();
        }
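Because Crawl returns an IObservable<Uri>, standard Rx operators can be composed in front of the subscription inside Main. A small hedged variation, assuming the System.Reactive LINQ operators are referenced:

            // Hypothetical variation of the subscription above: only report
            // distinct URIs from the start host.
            crawler.Crawl(new Uri("http://www.codinghorror.com/"))
                .Where(uri => uri.Host.EndsWith("codinghorror.com", StringComparison.OrdinalIgnoreCase))
                .Distinct()
                .Subscribe(
                    onNext: uri => Console.WriteLine(uri),
                    onCompleted: () => Console.WriteLine("Crawling completed"));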
		public CrawlerRulesService(Crawler crawler, IRobot robot, Uri baseUri)
		{
			AspectF.Define.
				NotNull(crawler, "crawler").
				NotNull(robot, "robot").
				NotNull(baseUri, "baseUri");

			m_Crawler = crawler;
			m_Robot = robot;
			m_BaseUri = baseUri;
		}
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
                NotNull(crawler, "crawler").
                NotNull(propertyBag, "propertyBag");

            string content = propertyBag.Text;
            if (content.IsNullOrEmpty())
            {
                return;
            }

            string contentLookupText = content.Max(MaxPostSize);
            string encodedRequestUrlFragment =
                "http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&q={0}".FormatWith(contentLookupText);

            m_Logger.Verbose("Google language detection using: {0}", encodedRequestUrlFragment);

            try
            {
                IWebDownloader downloader = NCrawlerModule.Container.Resolve<IWebDownloader>();
                PropertyBag result = downloader.Download(new CrawlStep(new Uri(encodedRequestUrlFragment), 0), null, DownloadMethod.GET);
                if (result.IsNull())
                {
                    return;
                }

                using (Stream responseReader = result.GetResponse())
                using (StreamReader reader = new StreamReader(responseReader))
                {
                    string json = reader.ReadLine();
                    using (MemoryStream ms = new MemoryStream(Encoding.Unicode.GetBytes(json)))
                    {
                        DataContractJsonSerializer ser =
                            new DataContractJsonSerializer(typeof (LanguageDetector));
                        LanguageDetector detector = ser.ReadObject(ms) as LanguageDetector;

                        if (!detector.IsNull())
                        {
                            CultureInfo culture = CultureInfo.GetCultureInfo(detector.responseData.language);
                            propertyBag["Language"].Value = detector.responseData.language;
                            propertyBag["LanguageCulture"].Value = culture;
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                m_Logger.Error("Error during google language detection, the error was: {0}", ex.ToString());
            }
        }
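The serializer above needs a LanguageDetector data contract, which is not in the listing. A hedged sketch whose property names mirror the access pattern detector.responseData.language; the real contract in the source may carry more fields, and the nested type name here is illustrative:

        // Sketch of the JSON data contract used with DataContractJsonSerializer above.
        [DataContract]
        public class LanguageDetector
        {
            [DataMember]
            public LanguageDetectorResponseData responseData { get; set; }
        }

        [DataContract]
        public class LanguageDetectorResponseData
        {
            [DataMember]
            public string language { get; set; }
        }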
Example #23
 //This is a basic example of how to use the crawler
 //In case of a cache miss, it prints out the page's title and
 //absolute URI, and saves the page data to the filesystem.
 public static void Main(String[] args)
 {
     if ((args.Length == 2 || args.Length == 3) &&
         Uri.IsWellFormedUriString(args[0], UriKind.Absolute))
     {
         Uri startingUri = new Uri(args[0]);
         String targetDirectoryPath = args[1];
         bool followExternal =
             args.Length == 3 && args[2] == "--follow-external";
         Console.WriteLine("Loading from cache...");
         Cache cache = new Cache(startingUri, targetDirectoryPath);
         Console.WriteLine(
             "Cache loaded - {0} pages stored in cache", cache.Count());
         Crawler crawler = new Crawler(cache, followExternal);
         Persister persister = new Persister(targetDirectoryPath, startingUri);
         //This event is fired when the crawler's process is over
         crawler.WorkComplete += () =>
         {
             Environment.Exit(0);
         };
         //This event is fired every time a valid page is downloaded
         crawler.NewPageFetched += (page) =>
         {
             Console.WriteLine(page.Title + " - " + page.Uri.AbsoluteUri);
             persister.Save(page);
         };
         //starts the crawler, on a different thread
         crawler.Crawl(startingUri);
         Console.WriteLine("Crawler started, press CTRL+C to interrupt");
         while (true) { }
     }
     else
     {
         Console.WriteLine("Crawler");
         Console.WriteLine("Usage:");
         Console.WriteLine(
             "Tenteikura.Example.exe <starting_uri> <target_directory> [--options]");
         Console.WriteLine(
             "<starting_uri> : a valid absolute URL which will be the starting point for the crawler");
         Console.WriteLine(
             "<target_directory> : the directory where the page files will be saved");
         Console.WriteLine("");
         Console.WriteLine("OPTIONS:");
         Console.WriteLine(
             "The only option available is --follow-external, which will make the crawler fetch non-local urls as well");
         Console.WriteLine("EXAMPLE: ");
         Console.WriteLine(
             @"Tenteikura.Example.exe http://telenor.com C:\mytargetdirectory --follow-external");
     }
 }
        public ApplicationIntegration(string name, string root)
        {
            _name = name;
            _root = root;
            var container = new WindsorContainer();
            container.Install(new DatabaseServiceInstaller());

            container.Register(Component.For<ILogger>().ImplementedBy<ConsoleLogger>().IsDefault());

            _siteService = container.Resolve<ISiteService>();
            _setupService = container.Resolve<ISetupService>();
            _crawlService = container.Resolve<ICrawlService>();

            _crawler = new Crawler(_crawlService, container.Resolve<ILogger>());
        }
    public MatchyBackend.Crawler mapToService(Crawler crawler)
    {
        MatchyBackend.Crawler result = new MatchyBackend.Crawler();

        if (crawler != null)
        {
            return new MatchyBackend.Crawler()
            {
                crawler_ID = crawler.crawler_ID,
                Description = crawler.Description
            };
        }
        else
            return result;
    }
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            if (!IsXmlContent(propertyBag.ContentType))
            {
                return;
            }

            using (Stream reader = propertyBag.GetResponse())
            using (StreamReader sr = new StreamReader(reader))
            {
                XDocument mydoc = XDocument.Load(sr);
                if (mydoc.Root == null)
                {
                    return;
                }

                XName qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");
                IEnumerable<string> urlNodes =
                    from e in mydoc.Descendants(qualifiedName)
                    where !e.Value.IsNullOrEmpty() && e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                    select e.Value;

                foreach (string url in urlNodes)
                {
                    // add new crawler steps
                    string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                    string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(url);
                    string normalizedLink = NormalizeLink(baseUrl, decodedLink);

                    if (normalizedLink.IsNullOrEmpty())
                    {
                        continue;
                    }

                    crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                        propertyBag.Step, new Dictionary<string, object>
                            {
                                {Resources.PropertyBagKeyOriginalUrl, url},
                                {Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri}
                            });
                }
            }
        }
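The sitemap step calls NormalizeLink(baseUrl, decodedLink), which is not shown. One plausible sketch, assuming it resolves the link against the base URL and returns an empty string for anything that cannot be parsed:

        // Hypothetical normalization: resolve relative links against the base URL
        // and return the absolute form, or an empty string when the link is invalid.
        private static string NormalizeLink(string baseUrl, string link)
        {
            Uri absoluteUri;
            if (Uri.TryCreate(new Uri(baseUrl), link, out absoluteUri))
            {
                return absoluteUri.AbsoluteUri;
            }

            return string.Empty;
        }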
Example #27
        static void Main(string[] args)
        {
            ThreadPool.SetMinThreads(200, 200);
            var crawler = new Crawler();
            var fileStore = new FileStore(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Cache"));
            var cachingHandler = new CachingHandler(fileStore)
                {
                    InnerHandler = new HttpClientHandler()
                };

            crawler.Requester = new HttpClient(cachingHandler);
            if (ConfigurationManager.ConnectionStrings["UrlStore"] != null)
                crawler.Store = new UrlStore();
            crawler.Crawl("http://YAHOO.COM");
            Console.Read();
        }
 public Crawler mapFromService(MatchyBackend.Crawler crawler)
 {
     if (crawler.crawler_ID != 0)
     {
         return new Crawler()
         {
             crawler_ID = crawler.crawler_ID,
             Description = crawler.Description
         };
     }
     else
     {
         Crawler result = new Crawler();
         return result;
     }
 }
		/// <summary>
		/// Extracts email addresses from the downloaded text and stores them in the property bag.
		/// </summary>
		/// <param name="crawler">
		/// The crawler.
		/// </param>
		/// <param name="propertyBag">
		/// The property bag.
		/// </param>
		public void Process(Crawler crawler, PropertyBag propertyBag)
		{
			AspectF.Define.
				NotNull(crawler, "crawler").
				NotNull(propertyBag, "propertyBag");

			string text = propertyBag.Text;
			if (text.IsNullOrEmpty())
			{
				return;
			}

			MatchCollection matches = s_EmailRegex.Value.Matches(text);
			propertyBag["Email"].Value = matches.Cast<Match>().
				Select(match => match.Value).
				Join(";");
		}
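The step reads from an s_EmailRegex field that is not part of the listing; the .Value access suggests a lazily created Regex. A hedged sketch with an illustrative pattern (the pattern in the original source may differ):

		// Sketch of the lazily initialized email regex consumed above.
		private static readonly Lazy<Regex> s_EmailRegex =
			new Lazy<Regex>(() => new Regex(@"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}",
				RegexOptions.Compiled | RegexOptions.IgnoreCase));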
Example #30
 // ReSharper restore InconsistentNaming
 private static bool Handler(Crawler crawler, CtrlType sig)
 {
     switch (sig)
     {
         case CtrlType.CTRL_C_EVENT:
         case CtrlType.CTRL_LOGOFF_EVENT:
         case CtrlType.CTRL_SHUTDOWN_EVENT:
         case CtrlType.CTRL_CLOSE_EVENT:
             System.Console.WriteLine("Closing...");
             _store.Dispose();
             _frontier.Dispose();
             crawler.Stop();
             return false;
         default:
             return true;
     }
 }
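Handler takes the crawler as an extra argument, so it cannot be handed to the Win32 console API directly; a closure is the usual bridge. A hedged sketch of the wiring, assuming the standard kernel32 SetConsoleCtrlHandler P/Invoke (RegisterCtrlHandler is a hypothetical helper name):

 // Hypothetical registration: adapt Handler to the single-parameter delegate
 // expected by SetConsoleCtrlHandler, keeping a reference so the delegate
 // is not garbage collected while registered.
 private delegate bool ConsoleCtrlDelegate(CtrlType sig);

 [DllImport("kernel32.dll", SetLastError = true)]
 private static extern bool SetConsoleCtrlHandler(ConsoleCtrlDelegate handler, bool add);

 private static ConsoleCtrlDelegate _ctrlHandler;

 private static void RegisterCtrlHandler(Crawler crawler)
 {
     _ctrlHandler = sig => Handler(crawler, sig);
     SetConsoleCtrlHandler(_ctrlHandler, true);
 }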
Example #31
 /// <summary>
 /// Initializes a new instance of the <see cref="FileProcessor"/> class.
 /// </summary>
 /// <param name="crawler">The crawler.</param>
 /// <param name="requests">The requests collection.</param>
 public FileProcessor(Crawler crawler, IProducerConsumerCollection<Request<SitemapItem>> requests)
 {
     this.crawler  = crawler;
     this.requests = requests;
 }