public ActionResult Create(Crawler.Entity.Task task)
{
    if (ModelState.IsValid)
    {
        task.ID = Guid.NewGuid();
        taskService.Create(task);
        return RedirectToAction("Index");
    }

    ViewBag.CategoryId = new SelectList(categoryService.Table.ToList(), "ID", "Name", task.CategoryId);
    return View(task);
}
public override void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return;
    }

    if (!IsHtmlContent(propertyBag.ContentType))
    {
        return;
    }

    string documentDomHtml = string.Empty;
    Thread tempThread = new Thread(o =>
        {
            using (TridentBrowserForm internetExplorer = new TridentBrowserForm(propertyBag.ResponseUri.ToString()))
            {
                Application.Run(internetExplorer);
                documentDomHtml = internetExplorer.DocumentDomHtml;
            }
        });
    tempThread.SetApartmentState(ApartmentState.STA);
    tempThread.Start();
    tempThread.Join();

    propertyBag.GetResponse = () => new MemoryStream(Encoding.UTF8.GetBytes(documentDomHtml));
    base.Process(crawler, propertyBag);
}
public ActionResult Create(Crawler.Entity.TaskItem item, string createAddCaptureRule, string captureRuleForContent, string captureRuleForNavigation)
{
    if (ModelState.IsValid)
    {
        item.ID = Guid.NewGuid();

        if (!string.IsNullOrEmpty(captureRuleForContent))
        {
            Crawler.Entity.CaptureRule captureRuleCon = new Crawler.Entity.CaptureRule { ID = new Guid(captureRuleForContent) };
            captureRuleService.Context.Set<Crawler.Entity.CaptureRule>().Attach(captureRuleCon);
            item.CaptureRules.Add(captureRuleCon);
        }

        if (!string.IsNullOrEmpty(captureRuleForNavigation))
        {
            CaptureRule captureRuleNav = new CaptureRule { ID = new Guid(captureRuleForNavigation) };
            captureRuleService.Context.Set<CaptureRule>().Attach(captureRuleNav);
            item.CaptureRules.Add(captureRuleNav);
        }

        taskItemService.Create(item);

        if (string.IsNullOrEmpty(createAddCaptureRule))
        {
            return RedirectToAction("Index", new { taskid = item.TaskId });
        }
        else
        {
            return RedirectToAction("Create", "CaptureRule", new { taskitemid = item.ID });
        }
    }

    ViewBag.TaskId = new SelectList(taskService.GetAll(), "ID", "Name", item.TaskId);
    ViewBag.PageCategory = new SelectList(DictionaryDataUtil.GetData("pageType"), "ID", "Name", 1);
    return View(item);
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return;
    }

    using (TempFile tempFile = new TempFile())
    {
        using (FileStream fs = new FileStream(tempFile.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 0x1000))
        using (Stream input = propertyBag.GetResponse())
        {
            input.CopyToStream(fs);
        }

        UltraID3 id3 = new UltraID3();
        id3.Read(tempFile.FileName);
        propertyBag["MP3_Album"].Value = id3.Album;
        propertyBag["MP3_Artist"].Value = id3.Artist;
        propertyBag["MP3_Comments"].Value = id3.Comments;
        propertyBag["MP3_Duration"].Value = id3.Duration;
        propertyBag["MP3_Genre"].Value = id3.Genre;
        propertyBag["MP3_Title"].Value = id3.Title;
    }
}
public static void Run()
{
    NCrawlerModule.Setup();

    // Register new implementation for ICrawlerRules using our custom class CustomCrawlerRules defined below
    NCrawlerModule.Register(builder =>
        builder.Register((c, p) =>
            {
                NCrawlerModule.Setup(); // Return to standard setup
                return new CustomCrawlerRules(p.TypedAs<Crawler>(), c.Resolve<IRobot>(p), p.TypedAs<Uri>(), p.TypedAs<ICrawlerHistory>());
            }).
            As<ICrawlerRules>().
            InstancePerDependency());

    Console.Out.WriteLine("Advanced crawl demo");

    using (Crawler c = new Crawler(
        new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(), // Process html
        new DumperStep())
        {
            MaximumThreadCount = 2,
            MaximumCrawlDepth = 2,
            ExcludeFilter = Program.ExtensionsToSkip,
        })
    {
        // Begin crawl
        c.Crawl();
    }
}
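The registration above resolves a CustomCrawlerRules instance from a Crawler, an IRobot, a Uri, and an ICrawlerHistory. A minimal sketch of how such a class could be declared, assuming it derives from the CrawlerRulesService shown later in this section (the m_CrawlerHistory field name is an assumption):

public class CustomCrawlerRules : CrawlerRulesService
{
    private readonly ICrawlerHistory m_CrawlerHistory;

    public CustomCrawlerRules(Crawler crawler, IRobot robot, Uri baseUri, ICrawlerHistory crawlerHistory)
        : base(crawler, robot, baseUri)
    {
        m_CrawlerHistory = crawlerHistory;
    }

    // IsExternalUrl is overridden to launch a bounded child crawl for external
    // links; see the IsExternalUrl override later in this section.
}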
/// <summary>
/// Creates a new spider collection instance.
/// </summary>
/// <param name="crawler">The crawler object.</param>
public Spiders(Crawler crawler)
{
    this.crawler = crawler;
    // Create the spiders.
    this.spiderStandardFeeds = new SpiderStandardFeeds(this.crawler);
}
public static void Run() { NCrawlerModule.Setup(); Console.Out.WriteLine("\nSimple indexer demo"); // Setup crawler to crawl/index http://ncrawler.codeplex.com // * Step 1 - The Html Processor, parses and extracts links, text and more from html // * Step 2 - Custom step, that is supposed to send content to an Index or Database using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"), new HtmlDocumentProcessor( // Process html, filter links and content // Setup filter that removed all the text between <body and </body> // This can be custom tags like <!--BeginTextFiler--> and <!--EndTextFiler--> // or whatever you prefer. This way you can control what text is extracted on every page // Most cases you want just to filter the header information or menu text new Dictionary<string, string> { {"<body", "</body>"} }, // Setup filter that tells the crawler not to follow links between tags // that start with <head and ends with </head>. This can be custom tags like // <!--BeginNoFollow--> and <!--EndNoFollow--> or whatever you prefer. // This was you can control what links the crawler should not follow new Dictionary<string, string> { {"<head", "</head>"} }), new IndexerDemo()) { MaximumThreadCount = 2 }) // Custom Step to send filtered content to index { // Begin crawl c.Crawl(); } }
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return;
    }

    string extension = MapContentTypeToExtension(propertyBag.ContentType);
    if (extension.IsNullOrEmpty())
    {
        return;
    }

    propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;
    using (TempFile temp = new TempFile())
    {
        temp.FileName += "." + extension;
        using (FileStream fs = new FileStream(temp.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 0x1000))
        using (Stream input = propertyBag.GetResponse())
        {
            input.CopyToStream(fs);
        }

        using (FilterReader filterReader = new FilterReader(temp.FileName))
        {
            string content = filterReader.ReadToEnd();
            propertyBag.Text = content.Trim();
        }
    }
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    string textContent = propertyBag.Text; // Filtered text content

    // Here you can send downloaded filtered content to an index, database, filesystem or whatever
    Console.Out.WriteLine(textContent);
}
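As the comment notes, the filtered text can be sent anywhere; a minimal sketch of persisting it to the filesystem, assuming a hypothetical output directory and deriving the file name from the step URI:

// Sketch only: the output directory is a hypothetical example value.
string outputDirectory = @"C:\crawl-output";
Directory.CreateDirectory(outputDirectory);

// Derive a filesystem-safe name from the page URI.
string safeName = Uri.EscapeDataString(propertyBag.Step.Uri.ToString());
File.WriteAllText(Path.Combine(outputDirectory, safeName + ".txt"), textContent);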
public void TestMe()
{
    // The early throw marks this test as not implemented; the crawl below never runs.
    throw new NotImplementedException();

    var crawler = new Crawler();
    crawler.Crawl("http://google.com");
    Thread.Sleep(3000);
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return;
    }

    string extension = MapContentTypeToExtension(propertyBag.ContentType);
    if (extension.IsNullOrEmpty())
    {
        return;
    }

    propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;
    using (TempFile temp = new TempFile())
    {
        temp.FileName += "." + extension;
        File.WriteAllBytes(temp.FileName, propertyBag.Response);
        using (FilterReader filterReader = new FilterReader(temp.FileName))
        {
            string content = filterReader.ReadToEnd();
            propertyBag.Text = content.Trim();
        }
    }
}
public static void Run() { NCrawlerModule.Setup(); Console.Out.WriteLine("Simple crawl demo"); // Setup crawler to crawl http://ncrawler.codeplex.com // with 1 thread adhering to robot rules, and maximum depth // of 2 with 4 pipeline steps: // * Step 1 - The Html Processor, parses and extracts links, text and more from html // * Step 2 - Processes PDF files, extracting text // * Step 3 - Try to determine language based on page, based on text extraction, using google language detection // * Step 4 - Dump the information to the console, this is a custom step, see the DumperStep class using (Crawler c = new Crawler(new Uri("http://ncrawler.codeplex.com"), new HtmlDocumentProcessor(), // Process html new iTextSharpPdfProcessor.iTextSharpPdfProcessor(), // Add PDF text extraction new GoogleLanguageDetection(), // Add language detection new Mp3FileProcessor(), // Add language detection new DumperStep()) { // Custom step to visualize crawl MaximumThreadCount = 10, MaximumCrawlDepth = 10, ExcludeFilter = Program.ExtensionsToSkip, }) { // Begin crawl c.Crawl(); } }
public override bool IsExternalUrl(Uri uri)
{
    // Is the url external?
    if (!base.IsExternalUrl(uri))
    {
        return false;
    }

    // Yes, check if we have crawled it before
    if (!m_CrawlerHistory.Register(uri.GetUrlKeyString(m_Crawler.UriSensitivity)))
    {
        return true;
    }

    // Create child crawler to traverse external site with max 2 levels
    using (Crawler externalCrawler = new Crawler(uri,
        new HtmlDocumentProcessor(), // Process html
        new DumperStep())
        {
            MaximumThreadCount = 1,
            MaximumCrawlDepth = 2,
            MaximumCrawlCount = 10,
            ExcludeFilter = Program.ExtensionsToSkip,
        })
    {
        // Crawl external site
        externalCrawler.Crawl();
    }

    // Do not follow links on this crawler
    return true;
}
public override void Process(Crawler crawler, PropertyBag propertyBag) { AspectF.Define. NotNull(crawler, "crawler"). NotNull(propertyBag, "propertyBag"); if (propertyBag.StatusCode != HttpStatusCode.OK) { return; } if (!IsHtmlContent(propertyBag.ContentType)) { return; } using (GeckoBrowserForm geckoBrowserForm = new GeckoBrowserForm(XulRunnerPath, propertyBag.ResponseUri.ToString())) { geckoBrowserForm.Show(); while (!geckoBrowserForm.Done) { Application.DoEvents(); } propertyBag.GetResponse = () => new MemoryStream(Encoding.UTF8.GetBytes(geckoBrowserForm.DocumentDomHtml)); base.Process(crawler, propertyBag); } }
public void Test()
{
    var seed = new Uri("http://nyqui.st");
    var cache = new Cache(seed);
    var crawler = new Crawler(cache);

    bool finished = false;
    crawler.OnCompleted += () =>
    {
        Console.WriteLine("[Main] Crawl completed!");
        finished = true;
    };
    crawler.OnPageDownloaded += (page) =>
    {
        Console.WriteLine("[Main] Got page {0}", page.Url);
    };

    crawler.Start(seed);
    Console.WriteLine("[Main] Crawler started.");

    // Busy-wait until the OnCompleted callback flips the flag
    while (true)
    {
        if (finished)
        {
            Assert.True(true);
            break;
        }
    }
}
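The busy-wait above burns a core until the callback fires; a sketch of the same test using a wait handle instead, assuming the same Crawler events:

// Sketch: replace the polling loop with a wait handle (same events assumed).
var done = new ManualResetEventSlim(false);
crawler.OnCompleted += () => done.Set();
crawler.Start(seed);
done.Wait();          // Blocks the test thread until OnCompleted fires
Assert.True(true);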
/// <summary>
/// Creates a new PlanetLab manager.
/// </summary>
/// <param name="crawler">The crawler.</param>
public PlManager(Crawler crawler)
{
    // Validate the arguments.
    if (null == crawler) throw new ArgumentNullException("crawler");
    // Set the crawler.
    this.crawler = crawler;
}
/// <summary> /// </summary> /// <param name="crawler"> /// The crawler. /// </param> /// <param name="propertyBag"> /// The property bag. /// </param> public void Process(Crawler crawler, PropertyBag propertyBag) { CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value; string cultureDisplayValue = "N/A"; if (!contentCulture.IsNull()) { cultureDisplayValue = contentCulture.DisplayName; } TextExtraction t = new TextExtraction(); lock (this) { Item item = new Item(); item.Url = propertyBag.Step.Uri.ToString(); if (item.Url.StartsWith("http://bidvportal.vn/eDocman")) { item.Title = propertyBag.Title; string strTarget = t.GetMinimumString(propertyBag.Text, "Chi tiết văn bản", "Nội dung văn bản"); item.Text = strTarget; string strNgayPhatHanh = t.GetMinimumString(strTarget, "Ngày phát hành", "Số đi"); item.NgayPhatHanh = strNgayPhatHanh.Replace(' ','/'); string strSubject = t.GetMinimumString(strTarget, "Trích yếu", "Độ khẩn"); item.Subject = strSubject; item.ContentEncoding = propertyBag.ContentEncoding; item.ContentType = propertyBag.ContentType; item.Length = propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length; item.Depth = propertyBag.Step.Depth; //item.CultureDisplayValue = cultureDisplayValue; string[] strSplit = { "/" }; int day = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[0]); int month = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[1]); int year = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[2]); if ((DateTime.Now.Year == year) && (DateTime.Now.Month == month) && (DateTime.Now.Day == day)) { db.AddToItems(item); } } } try { db.SaveChanges(); } catch (Exception ex) { Console.WriteLine("====================================================="); Console.WriteLine(ex.Message); } }
public void Process(Crawler crawler, PropertyBag propertyBag) { if (propertyBag.StatusCode != HttpStatusCode.OK) { Console.Out.WriteLine("Url '{0}' referenced from {1} returned with statuscode {2}", propertyBag.Step.Uri, propertyBag.OriginalReferrerUrl, propertyBag.StatusCode); Console.Out.WriteLine(); } }
public CrawlerSettings(Crawler c)
{
    InitializeComponent();
    crawler = c;
    update_computer_file_textbox();
    update_data_folder_textbox();
}
static void Main(string[] args)
{
    Crawler crawler = new Crawler();
    IObservable<Uri> observable1 = crawler.Crawl(new Uri("http://www.codinghorror.com/"));
    observable1.Subscribe(onNext: Console.WriteLine,
        onCompleted: () => Console.WriteLine("Crawling completed"));
    Console.ReadLine();
}
public CrawlerRulesService(Crawler crawler, IRobot robot, Uri baseUri)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(robot, "robot").
        NotNull(baseUri, "baseUri");

    m_Crawler = crawler;
    m_Robot = robot;
    m_BaseUri = baseUri;
}
public void Process(Crawler crawler, PropertyBag propertyBag) { AspectF.Define. NotNull(crawler, "crawler"). NotNull(propertyBag, "propertyBag"); string content = propertyBag.Text; if (content.IsNullOrEmpty()) { return; } string contentLookupText = content.Max(MaxPostSize); string encodedRequestUrlFragment = "http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&q={0}".FormatWith(contentLookupText); m_Logger.Verbose("Google language detection using: {0}", encodedRequestUrlFragment); try { IWebDownloader downloader = NCrawlerModule.Container.Resolve<IWebDownloader>(); PropertyBag result = downloader.Download(new CrawlStep(new Uri(encodedRequestUrlFragment), 0), null, DownloadMethod.GET); if (result.IsNull()) { return; } using (Stream responseReader = result.GetResponse()) using (StreamReader reader = new StreamReader(responseReader)) { string json = reader.ReadLine(); using (MemoryStream ms = new MemoryStream(Encoding.Unicode.GetBytes(json))) { DataContractJsonSerializer ser = new DataContractJsonSerializer(typeof (LanguageDetector)); LanguageDetector detector = ser.ReadObject(ms) as LanguageDetector; if (!detector.IsNull()) { CultureInfo culture = CultureInfo.GetCultureInfo(detector.responseData.language); propertyBag["Language"].Value = detector.responseData.language; propertyBag["LanguageCulture"].Value = culture; } } } } catch (Exception ex) { m_Logger.Error("Error during google language detection, the error was: {0}", ex.ToString()); } }
//This is a basic example of how to use the crawler.
//In case of a cache miss, it prints out the page's title and
//absolute URI, and saves the page data to the filesystem.
public static void Main(String[] args)
{
    if ((args.Length == 2 || args.Length == 3) && Uri.IsWellFormedUriString(args[0], UriKind.Absolute))
    {
        Uri startingUri = new Uri(args[0]);
        String targetDirectoryPath = args[1];
        bool followExternal = args.Length == 3 && args[2] == "--follow-external";

        Console.WriteLine("Loading from cache...");
        Cache cache = new Cache(startingUri, targetDirectoryPath);
        Console.WriteLine("Cache loaded - {0} pages stored in cache", cache.Count());

        Crawler crawler = new Crawler(cache, followExternal);
        Persister persister = new Persister(targetDirectoryPath, startingUri);

        //This event is fired when the crawler's process is over
        crawler.WorkComplete += () =>
        {
            Environment.Exit(0);
        };

        //This event is fired every time a valid page is downloaded
        crawler.NewPageFetched += (page) =>
        {
            Console.WriteLine(page.Title + " - " + page.Uri.AbsoluteUri);
            persister.Save(page);
        };

        //Starts the crawler, on a different thread
        crawler.Crawl(startingUri);
        Console.WriteLine("Crawler started, press CTRL+C to interrupt");
        while (true) { }
    }
    else
    {
        Console.WriteLine("Crawler");
        Console.WriteLine("Usage:");
        Console.WriteLine("Tenteikura.Example.exe <starting_uri> <target_directory> [--options]");
        Console.WriteLine("<starting_uri> : a valid absolute URL which will be the starting point for the crawler");
        Console.WriteLine("<target_directory> : the directory where the page files will be saved");
        Console.WriteLine("");
        Console.WriteLine("OPTIONS:");
        Console.WriteLine("The only option available is --follow-external, which will make the crawler fetch non-local urls as well");
        Console.WriteLine("EXAMPLE: ");
        Console.WriteLine(@"Tenteikura.Example.exe http://telenor.com C:\mytargetdirectory --follow-external");
    }
}
public ApplicationIntegration(string name, string root)
{
    _name = name;
    _root = root;

    var container = new WindsorContainer();
    container.Install(new DatabaseServiceInstaller());
    container.Register(Component.For<ILogger>().ImplementedBy<ConsoleLogger>().IsDefault());

    _siteService = container.Resolve<ISiteService>();
    _setupService = container.Resolve<ISetupService>();
    _crawlService = container.Resolve<ICrawlService>();
    _crawler = new Crawler(_crawlService, container.Resolve<ILogger>());
}
public MatchyBackend.Crawler mapToService(Crawler crawler)
{
    MatchyBackend.Crawler result = new MatchyBackend.Crawler();
    if (crawler != null)
    {
        return new MatchyBackend.Crawler()
        {
            crawler_ID = crawler.crawler_ID,
            Description = crawler.Description
        };
    }
    else
        return result;
}
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return;
    }

    if (!IsXmlContent(propertyBag.ContentType))
    {
        return;
    }

    using (Stream reader = propertyBag.GetResponse())
    using (StreamReader sr = new StreamReader(reader))
    {
        XDocument mydoc = XDocument.Load(sr);
        if (mydoc.Root == null)
        {
            return;
        }

        XName qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");
        IEnumerable<string> urlNodes =
            from e in mydoc.Descendants(qualifiedName)
            where !e.Value.IsNullOrEmpty() && e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            select e.Value;

        foreach (string url in urlNodes)
        {
            // Add new crawler steps
            string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
            string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(url);
            string normalizedLink = NormalizeLink(baseUrl, decodedLink);
            if (normalizedLink.IsNullOrEmpty())
            {
                continue;
            }

            crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                propertyBag.Step,
                new Dictionary<string, object>
                {
                    {Resources.PropertyBagKeyOriginalUrl, url},
                    {Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri}
                });
        }
    }
}
static void Main(string[] args)
{
    ThreadPool.SetMinThreads(200, 200);

    var crawler = new Crawler();
    var fileStore = new FileStore(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Cache"));
    var cachingHandler = new CachingHandler(fileStore)
    {
        InnerHandler = new HttpClientHandler()
    };
    crawler.Requester = new HttpClient(cachingHandler);

    if (ConfigurationManager.ConnectionStrings["UrlStore"] != null)
        crawler.Store = new UrlStore();

    crawler.Crawl("http://YAHOO.COM");
    Console.Read();
}
public Crawler mapFromService(MatchyBackend.Crawler crawler)
{
    if (crawler.crawler_ID != 0)
    {
        return new Crawler()
        {
            crawler_ID = crawler.crawler_ID,
            Description = crawler.Description
        };
    }
    else
    {
        Crawler result = new Crawler();
        return result;
    }
}
/// <summary> /// </summary> /// <param name="crawler"> /// The crawler. /// </param> /// <param name="propertyBag"> /// The property bag. /// </param> public void Process(Crawler crawler, PropertyBag propertyBag) { AspectF.Define. NotNull(crawler, "crawler"). NotNull(propertyBag, "propertyBag"); string text = propertyBag.Text; if (text.IsNullOrEmpty()) { return; } MatchCollection matches = s_EmailRegex.Value.Matches(text); propertyBag["Email"].Value = matches.Cast<Match>(). Select(match => match.Value). Join(";"); }
// ReSharper restore InconsistentNaming
private static bool Handler(Crawler crawler, CtrlType sig)
{
    switch (sig)
    {
        case CtrlType.CTRL_C_EVENT:
        case CtrlType.CTRL_LOGOFF_EVENT:
        case CtrlType.CTRL_SHUTDOWN_EVENT:
        case CtrlType.CTRL_CLOSE_EVENT:
            System.Console.WriteLine("Closing...");
            _store.Dispose();
            _frontier.Dispose();
            crawler.Stop();
            return false;
        default:
            return true;
    }
}
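For the handler above to run, it has to be registered with the Win32 console API; a sketch of how that wiring might look, assuming the usual SetConsoleCtrlHandler P/Invoke pattern (the delegate type, _handler field, and RegisterShutdownHandler method are illustrative):

// Sketch: registering the handler for Ctrl+C, logoff, shutdown and close events.
// Requires: using System.Runtime.InteropServices;
private delegate bool EventHandler(CtrlType sig);

[DllImport("Kernel32")]
private static extern bool SetConsoleCtrlHandler(EventHandler handler, bool add);

// Keep a reference so the delegate is not garbage collected while registered.
private static EventHandler _handler;

private static void RegisterShutdownHandler(Crawler crawler)
{
    _handler = sig => Handler(crawler, sig);
    SetConsoleCtrlHandler(_handler, true);
}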
/// <summary>
/// Initializes a new instance of the <see cref="FileProcessor"/> class.
/// </summary>
/// <param name="crawler">The crawler.</param>
/// <param name="requests">Requests.</param>
public FileProcessor(Crawler crawler, IProducerConsumerCollection<Request<SitemapItem>> requests)
{
    this.crawler = crawler;
    this.requests = requests;
}