public void Consume_PageProcessorThrowsException_DoesNotCrash()
{
    //Arrange
    CrawlResult fakeResult = new CrawlResult { CrawlContext = GetCrawlContext(_dummyCrawlProcessors) };
    CrawledPage crawledPage = new PageRequester(new CrawlConfiguration()).MakeRequest(new Uri("http://www.adamthings.com"));

    _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
    _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()))
        .Returns(fakeResult)
        .Callback(() => _fakeWebCrawler.Raise(f => f.PageCrawlCompleted += null,
            new PageCrawlCompletedArgs(GetCrawlContext(_dummyCrawlProcessors), crawledPage)));
    _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);
    _fakeProcessor1.Setup(f => f.ProcessCrawledPage(It.IsAny<CrawlContext>(), crawledPage)).Throws(new Exception("oh no page"));

    //Act
    _uut.Consume(new Domain { DomainId = 1, Uri = new Uri("http://www.adamthings.com") }, _dummyCancellationToken);

    //Assert
    _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
    _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
    _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()), Times.Exactly(1));
    _fakeProcessor1.Verify(f => f.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()), Times.Exactly(1));
    _fakeProcessor2.Verify(f => f.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()), Times.Exactly(1));
    _fakeProcessor3.Verify(f => f.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()), Times.Exactly(1));
}
/// <summary>
/// Crawl and parse data from a file.
/// </summary>
/// <param name="filename">Path and filename.</param>
/// <returns>Parse result.</returns>
public ParseResult ParseFromFile(string filename)
{
    if (String.IsNullOrEmpty(filename))
    {
        throw new ArgumentNullException(nameof(filename));
    }

    ParseResult ret = new ParseResult();
    ret.Json = new ParseResult.JsonParseResult();

    FileCrawler crawler = new FileCrawler(filename);
    CrawlResult cr = crawler.Get();
    if (!cr.Success)
    {
        ret.Time.End = DateTime.UtcNow;
        return ret;
    }

    byte[] sourceData = cr.Data;
    string sourceContent = Encoding.UTF8.GetString(sourceData);
    return ProcessSourceContent(sourceContent);
}
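The FileCrawler used above follows the same Get()/CrawlResult pattern as the SQL, HTTP, and S3 crawlers shown later in these examples. A minimal stand-alone sketch of that pattern follows; the file path is a placeholder, not taken from the original source.

// Hypothetical direct use of FileCrawler; "input.json" is a placeholder path.
FileCrawler fileCrawler = new FileCrawler("input.json");
CrawlResult crawlResult = fileCrawler.Get();
if (crawlResult.Success)
{
    // Data holds the raw bytes read from the file.
    Console.WriteLine(Encoding.UTF8.GetString(crawlResult.Data));
}
else
{
    Console.WriteLine("Crawl failed: " + crawlResult.Exception);
}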
/// <summary>
/// Download the object to the supplied filename.
/// </summary>
/// <param name="filename">The filename where the object should be saved.</param>
/// <returns>Crawl result.</returns>
public CrawlResult Download(string filename)
{
    if (String.IsNullOrEmpty(filename))
    {
        throw new ArgumentNullException(nameof(filename));
    }

    CrawlResult ret = new CrawlResult();

    try
    {
        // 'Filename' (capital F) is the source file held by this crawler instance;
        // 'filename' is the destination supplied by the caller.
        ret.Metadata = CrawlResult.ObjectMetadata.FromFileInfo(new FileInfo(Filename));
        ret.ContentLength = new FileInfo(Filename).Length;

        using (FileStream source = new FileStream(Filename, FileMode.Open, FileAccess.Read))
        {
            using (FileStream target = new FileStream(filename, FileMode.CreateNew, FileAccess.ReadWrite))
            {
                source.CopyTo(target);
            }
        }

        ret.DataStream = null;
        ret.Success = true;
    }
    catch (Exception e)
    {
        ret.Exception = e;
    }

    ret.Time.End = DateTime.UtcNow;
    return ret;
}
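A brief, hypothetical caller for Download; it assumes the hosting crawler is constructed with the source path, as FileCrawler is in the previous example, and both paths are placeholders.

// Hypothetical usage sketch; source and destination paths are placeholders.
FileCrawler sourceCrawler = new FileCrawler(@"c:\temp\source.txt");
CrawlResult download = sourceCrawler.Download(@"c:\temp\copy.txt");
if (download.Success)
{
    Console.WriteLine("Copied " + download.ContentLength + " bytes");
}
else
{
    Console.WriteLine("Download failed: " + download.Exception);
}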
private static void Main(string[] args)
{
    try
    {
        Uri uriToCrawl = GetSiteToCrawl();

        // I'm using the default crawler
        var crawler = new PoliteWebCrawler();

        // I need to subscribe to this event in order to process pages that have been crawled
        crawler.PageCrawlCompletedAsync += ProcessPageCrawlCompleted;

        // Start the crawl
        CrawlResult crawlResult = crawler.Crawl(uriToCrawl);

        // Generate report
        Task<ReportResult> reportTask = GenerateReport();
        PrintResultInformation(reportTask.Result);
    }
    catch (Exception ex)
    {
        System.Console.ForegroundColor = ConsoleColor.Red;
        System.Console.WriteLine("There was an error when trying to crawl page.");
        System.Console.Write(ex);
        System.Console.ReadKey();
    }
}
public object ExtractProperties(CrawlResult crawlResult, IHtmlCollection<IElement> elements)
{
    Directory.CreateDirectory(@"c:\temp\WebScraper");

    using (var client = new HttpClient())
    {
        foreach (var img in elements.SelectMany(e => e.QuerySelectorAll("img")))
        {
            var src = new Uri(crawlResult.RequestUrl, new Uri(img.Attributes["src"].Value, UriKind.RelativeOrAbsolute));
            var fileName = Path.Combine(@"c:\temp\WebScraper", Path.GetFileName(src.LocalPath));

            if (File.Exists(fileName) == false)
            {
                Console.WriteLine($"Downloading {src} to {fileName}");
                using (var f = File.OpenWrite(fileName))
                using (var s = client.GetStreamAsync(src).Result)
                {
                    s.CopyTo(f);
                }
            }
            else
            {
                Console.WriteLine($"Skipping download of {src} to {fileName}");
            }

            img.SetAttribute("data-local-src", fileName);
        }
    }

    return elements.Select(this.htmlSelector).Aggregate((prod, next) => prod + "\n" + next);
}
static void SqlCrawler()
{
    string query = Common.InputString("Query:", null, true);
    if (String.IsNullOrEmpty(query))
    {
        return;
    }

    DbSettings db = new DbSettings(
        (DbType)(Enum.Parse(typeof(DbType), Common.InputString("DB type:", "Mysql", false))),
        Common.InputString("Hostname:", "localhost", false),
        Common.InputInteger("Port:", 3306, true, false),
        Common.InputString("Username:", "root", false),
        Common.InputString("Password:", "password", false),
        Common.InputString("Instance:", null, true),
        Common.InputString("Database name:", "dbname", false));

    SqlCrawler sc = new SqlCrawler(db, query);
    CrawlResult cr = sc.Get();

    Console.WriteLine("Success : " + cr.Success);
    Console.WriteLine("Start time : " + cr.Time.Start.ToString());
    Console.WriteLine("End time : " + cr.Time.End.ToString());
    Console.WriteLine("Total ms : " + cr.Time.TotalMs.ToString() + "ms");
    Console.WriteLine("Data : ");
    if (cr.DataTable != null)
    {
        Console.WriteLine(Common.SerializeJson(Common.DataTableToListDynamic(cr.DataTable), true));
    }
    else
    {
        Console.WriteLine(" (null)");
    }
}
static void S3Crawler()
{
    string endpoint = Common.InputString("Endpoint:", null, true);
    bool ssl = Common.InputBoolean("SSL:", true);
    string bucket = Common.InputString("Bucket:", null, false);
    string key = Common.InputString("Key:", null, false);
    string accessKey = Common.InputString("Access Key:", null, false);
    string secretKey = Common.InputString("Secret Key:", null, false);
    AwsRegion region = (AwsRegion)(Enum.Parse(typeof(AwsRegion), Common.InputString("Region:", "USWest1", false)));
    string baseUrl = Common.InputString("Base URL:", "http://localhost:8000/{bucket}/{key}", false);

    S3Crawler s3c = null;
    if (!String.IsNullOrEmpty(endpoint))
    {
        s3c = new S3Crawler(endpoint, ssl, bucket, key, accessKey, secretKey, region, baseUrl);
    }
    else
    {
        s3c = new S3Crawler(bucket, key, accessKey, secretKey, region);
    }

    CrawlResult cr = s3c.Get();

    Console.WriteLine("Success : " + cr.Success);
    Console.WriteLine("Start time : " + cr.Time.Start.ToString());
    Console.WriteLine("End time : " + cr.Time.End.ToString());
    Console.WriteLine("Total ms : " + cr.Time.TotalMs.ToString() + "ms");
    Console.WriteLine("Content length : " + cr.ContentLength + " bytes");
    Console.WriteLine("Metadata : " + Common.SerializeJson(cr.Metadata, false));
    Console.WriteLine("Data :" + Environment.NewLine + Encoding.UTF8.GetString(cr.Data));
}
static void Main(string[] args)
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 1;
    crawlConfig.MaxPagesToCrawl = 1;

    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    //crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    //crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    //This is synchronous, it will not go to the next line until the crawl has completed
    CrawlResult result = crawler.Crawl(new Uri("http://www.kmhk.kmu.edu.tw/news/list.asp?P_classify=9"));

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
}
public CrawlResult Crawl()
{
    IWebCrawler crawler = InitCrawler();

    Uri uriToCrawl = new Uri("http://rabota.ua/jobsearch/vacancy_list"); //http://rabota.ua/jobsearch/vacancy_list?pg=1000

    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        if (pageToCrawl.Uri.AbsoluteUri.Contains(@"rabota.ua/jobsearch/vacancy_list") &&
            !pageToCrawl.Uri.AbsoluteUri.Contains(@"period"))
        {
            return new CrawlDecision { Allow = true };
        }

        return new CrawlDecision { Allow = false, Reason = "Parse only job pages" };
    });

    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(uriToCrawl);
    return result;
}
public void Consume_ValidDomain_CrawlerCrawlBagSet()
{
    //Arrange
    Domain domain = new Domain { DomainId = 1, Uri = new Uri("http://a.com") };
    CrawlContext context = GetCrawlContext(_dummyCrawlProcessors);
    CrawlResult fakeResult = new CrawlResult { CrawlContext = context };

    _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
    _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>())).Returns(fakeResult);
    _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);

    //Act
    DomainCrawlResult result = _uut.Consume(domain, _dummyCancellationToken);

    //Assert
    _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
    _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
    _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()), Times.Exactly(1));
    Assert.AreEqual(domain, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.Domain);
    Assert.AreEqual(_dummyProcessorContext.PrimaryPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.PrimaryPersistenceProvider);
    Assert.AreEqual(_dummyProcessorContext.BackupPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.BackupPersistenceProvider);
    Assert.AreEqual(_dummyCrawlProcessors, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.CrawlProcessors);
}
/// <summary>
/// Run the crawler
/// </summary>
public void StartCrawl()
{
    // Set up the crawler
    PoliteWebCrawler crawler = new PoliteWebCrawler();

    // Configure the crawl event handlers
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    // Start crawling
    CrawlResult result = crawler.Crawl(new Uri(link)); //This is synchronous, it will not go to the next line until the crawl has completed

    // Report the result
    if (result.ErrorOccurred)
    {
        log.Error("Crawl of " + result.RootUri.AbsoluteUri + " completed with error: " + result.ErrorException.Message);
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        log.Info("Crawl of " + result.RootUri.AbsoluteUri + " completed without error.");
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }

    flag = false;
}
public void Consume_DomainProcessorTimesOut_DoesNotCrash()
{
    //Arrange
    _dummyConfig.MaxDomainProcessorTimeInMilliSecs = 1000;
    CrawlResult fakeResult = new CrawlResult { CrawlContext = GetCrawlContext(_dummyCrawlProcessors) };

    _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
    _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>())).Returns(fakeResult);
    _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);
    _fakeProcessor1.Setup(f => f.ProcessCrawledDomain(It.IsAny<CrawlContext>()))
        .Callback((CrawlContext cc) => System.Threading.Thread.Sleep(10000)); //ten seconds

    //Act
    Stopwatch timer = Stopwatch.StartNew();
    _uut.Consume(new Domain { DomainId = 1, Uri = new Uri("http://a.com") }, _dummyCancellationToken);
    timer.Stop();

    //Assert
    _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
    _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
    _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()), Times.Exactly(1));
    _fakeProcessor1.Verify(f => f.ProcessCrawledDomain(It.IsAny<CrawlContext>()), Times.Exactly(1));
    _fakeProcessor2.Verify(f => f.ProcessCrawledDomain(It.IsAny<CrawlContext>()), Times.Exactly(1));
    _fakeProcessor3.Verify(f => f.ProcessCrawledDomain(It.IsAny<CrawlContext>()), Times.Exactly(1));
    Assert.IsTrue(timer.ElapsedMilliseconds > 900);
    Assert.IsTrue(timer.ElapsedMilliseconds < 2000);
}
public void Consume_DomainProcessorThrowsException_DoesNotCrash()
{
    //Arrange
    CrawlResult fakeResult = new CrawlResult { CrawlContext = GetCrawlContext(_dummyCrawlProcessors) };

    _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
    _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>())).Returns(fakeResult);
    _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);
    _fakeProcessor1.Setup(f => f.ProcessCrawledDomain(It.IsAny<CrawlContext>())).Throws(new Exception("oh no domain"));

    //Act
    _uut.Consume(new Domain { DomainId = 1, Uri = new Uri("http://a.com") }, _dummyCancellationToken);

    //Assert
    _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
    _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
    _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()), Times.Exactly(1));
    _fakeProcessor1.Verify(f => f.ProcessCrawledDomain(It.IsAny<CrawlContext>()), Times.Exactly(1));
    _fakeProcessor2.Verify(f => f.ProcessCrawledDomain(It.IsAny<CrawlContext>()), Times.Exactly(1));
    _fakeProcessor3.Verify(f => f.ProcessCrawledDomain(It.IsAny<CrawlContext>()), Times.Exactly(1));
}
public void Consume_ValidDomain_AppConfigHttpStatusesToProcessNullWebResponse_CrawlsPerformed()
{
    //Arrange
    CrawlResult fakeResult = new CrawlResult { CrawlContext = GetCrawlContext(_dummyCrawlProcessors) };
    CrawledPage crawledPage = new PageRequester(new CrawlConfiguration()).MakeRequest(new Uri("http://www.adamthings.com"));
    _dummyConfig.HttpStatusesToProcess = new string[] { };
    crawledPage.HttpWebResponse = null;

    _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
    _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()))
        .Returns(fakeResult)
        .Callback(() => _fakeWebCrawler.Raise(f => f.PageCrawlCompleted += null,
            new PageCrawlCompletedArgs(GetCrawlContext(_dummyCrawlProcessors), crawledPage)));
    _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);

    //Act
    _uut.Consume(new Domain { DomainId = 1, Uri = new Uri("http://www.adamthings.com") }, _dummyCancellationToken);

    //Assert
    _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
    _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
    _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()), Times.Exactly(1));
    _fakeProcessor1.Verify(f => f.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()), Times.Exactly(1));
    _fakeProcessor2.Verify(f => f.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()), Times.Exactly(1));
    _fakeProcessor3.Verify(f => f.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()), Times.Exactly(1));
}
public bool Crawl(Site s)
{
    m_currentSite = s;
    shouldAbort = false;

    Program.Log("Beginning crawl of " + s.Name);
    Program.Status("Crawling " + s.Name);

    try
    {
        CrawlResult result = crawler.Crawl(new Uri(s.Uri));

        if (result.ErrorOccurred)
        {
            Program.Log(String.Format("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message));
            return false;
        }
        else
        {
            Program.Log(String.Format("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri));
        }

        Program.Log(s.Name + " has been crawled and added to Milkshake successfully.");
        return true;
    }
    catch (Exception e)
    {
        Program.Log(e.ToString());
        return false;
    }
}
/// <summary>
/// Crawl all pages of the website and convert them to NLPTextDocuments
/// </summary>
public void ExtractNLPTextDocuments()
{
    Perfs = new PerfMonitor();
    DisplayMessages(WriteStartMessage);
    DisplayMessages(Perfs.WriteStatusHeader);

    // This is synchronous, it will not go to the next line until the crawl has completed
    CrawlResult result = crawler.Crawl(ExtractorParams.RootUrl);
    Perfs.EndTime = DateTime.Now;

    // Write end status to log file
    Perfs.WriteStatus(messagesWriter);

    string endMessage = null;
    if (result.ErrorOccurred)
    {
        endMessage = "Extraction completed with fatal error \"" + result.ErrorException.Message + "\"";
    }
    else
    {
        endMessage = "Extraction completed";
    }
    DisplayMessages(WriteEndMessage, endMessage);
}
/// <summary>
/// Crawl, query, and parse data from a SQL database.
/// </summary>
/// <param name="dbSettings">Database settings.</param>
/// <param name="query">Query to execute.</param>
/// <returns>Parse result.</returns>
public ParseResult ParseFromQuery(DbSettings dbSettings, string query)
{
    if (dbSettings == null)
    {
        throw new ArgumentNullException(nameof(dbSettings));
    }
    if (String.IsNullOrEmpty(query))
    {
        throw new ArgumentNullException(nameof(query));
    }

    ParseResult ret = new ParseResult();
    ret.Sql = new ParseResult.SqlParseResult();

    SqlCrawler crawler = new SqlCrawler(dbSettings, query);
    CrawlResult cr = crawler.Get();
    if (!cr.Success)
    {
        ret.Time.End = DateTime.UtcNow;
        return ret;
    }

    return ProcessSourceContent(cr.DataTable);
}
public async Task<CrawlResult> Crawl(Uri uri)
{
    var result = new CrawlResult();
    try
    {
        // Fetch the page and parse it with HtmlAgilityPack.
        HttpClient client = new HttpClient();
        var response = await client.GetAsync(uri);
        var pageContents = await response.Content.ReadAsStringAsync();

        HtmlDocument pageDocument = new HtmlDocument();
        pageDocument.LoadHtml(pageContents);

        // Collect anchor elements and their non-empty href values.
        var links = pageDocument.DocumentNode.Descendants("a");
        var linkedPages = links.Select(a => a.GetAttributeValue("href", null))
            .Where(u => !String.IsNullOrEmpty(u));

        result.ContainsHtml = true;
        result.Links = links;
        result.LinkedPages = linkedPages;
    }
    catch
    {
        result.ContainsHtml = false;
    }

    return result;
}
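A short usage sketch for the asynchronous Crawl variant above; "SiteCrawler" is a hypothetical name for whatever class hosts the method, and the URL is a placeholder.

// Hypothetical caller (inside an async method); SiteCrawler and the URL are assumptions.
var siteCrawler = new SiteCrawler();
CrawlResult page = await siteCrawler.Crawl(new Uri("https://example.com"));
if (page.ContainsHtml)
{
    // LinkedPages holds the non-empty href values gathered above.
    foreach (var href in page.LinkedPages)
    {
        Console.WriteLine(href);
    }
}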
static void Main(string[] args)
{
    log4net.Config.XmlConfigurator.Configure();
    PrintDisclaimer();

    Uri uriToCrawl = GetSiteToCrawl(args);

    IWebCrawler crawler;

    //Uncomment only one of the following to see that instance in action
    //crawler = GetDefaultWebCrawler();
    //crawler = GetManuallyConfiguredWebCrawler();
    crawler = GetCustomBehaviorUsingLambdaWebCrawler();

    //Subscribe to any of these asynchronous events; there are also synchronous versions of each.
    //This is where you process data about specific events of the crawl
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    //Start the crawl
    //This is a synchronous call
    CrawlResult result = crawler.Crawl(uriToCrawl);

    //Now go view the log.txt file that is in the same directory as this executable. It has
    //all the statements that you were trying to read in the console window :).
    //Not enough data being logged? Change the app.config file's log4net log level from "INFO" to "DEBUG"

    PrintDisclaimer();
    Console.ReadKey();
}
public void Crawl(CrawlRequest request)
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 10;
    crawlConfig.MaxPagesToCrawl = 1000;
    crawlConfig.UserAgentString = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; abot v1.0 http://code.google.com/p/abot)";
    crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
    crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");
    crawlConfig.MaxCrawlDepth = 10;
    crawlConfig.DownloadableContentTypes = "text/html, text/plain";

    //Will use the manually created crawlConfig object above
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);

    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(new Uri(request.EntryURL));

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
}
public async Task<CrawlResult> Scrape()
{
    CrawlResult crawlResult = new CrawlResult();

    BrowserSettings browserSettings = new BrowserSettings
    {
        WindowlessFrameRate = 1
    };

    using (ChromiumWebBrowser browser = new ChromiumWebBrowser(browserSettings: browserSettings))
    {
        try
        {
            await WaitForBrowserInit(browser);
            await LoadAsync(browser, string.Format(CrawlDescription.SearchString, CrawlDescription.Keyword));
            crawlResult.Ads = await BingExtractor.ExtractTextAds(browser);
            crawlResult.CrawlResultID = CrawlResultID.Success;
        }
        catch (Exception ex)
        {
            Log.Error("Scraper Exception({0}): {1}", ex.GetType(), ex.Message);
            crawlResult.CrawlResultID = CrawlResultID.Failure;
        }
    }

    return crawlResult;
}
private void InitializeCommands()
{
    openFileCommand = new CommandImpl(async () =>
    {
        LinkedList<string> sites = new LinkedList<string>();
        sites = processingOpenFileCommand();

        foreach (string link in sites)
        {
            crawlerHandler.ConsoleOutput = link + "\n";
        }

        if (openFileCommand.CanExecute)
        {
            openFileCommand.CanExecute = false;
        }

        result = await Task.Run(() => crawler.PerformCrawlingAsync(sites, 0));
        openFileCommand.CanExecute = true;
        crawlerHandler.ConsoleOutput = result.getSite().Addres;
    });

    iAmAliveCommand = new CommandImpl(async () => { ShowIAmAlive(); });
}
//protected void IsBrowserLoaded(object sender, System.EventArgs e)
//{
//    // No further calls needed
//    Browser.BrowserInitialized -= IsBrowserLoaded;
//    // Continue main thread
//    SemaphoreObj.Release();
//}

protected virtual /*async Task<*/ IEnumerable<CrawlResult> /*>*/ GetSitemapResults(IRobotsSitemap sitemap, CancellationTokenSource cancellationTokenSource)
{
    List<CrawlResult> results = new List<CrawlResult>();

    if (!sitemap.IsLoaded)
    {
        sitemap = SitemapLoader.Load(sitemap);
    }

    if (sitemap.Sitemaps != null && sitemap.Sitemaps.Any())
    {
        Logger.InfoFormat("Sitemap: {0} | Inner sitemaps' count: {1}", sitemap.Location, sitemap.Sitemaps.Count());
        foreach (IRobotsSitemap derivedSitemap in sitemap.Sitemaps)
        {
            results.AddRange(/*await*/ GetSitemapResults(derivedSitemap, cancellationTokenSource) /*.Result*/);
        }
    }

    if (sitemap.Items != null && sitemap.Items.Any())
    {
        Logger.InfoFormat("Sitemap: {0} | Uris' count: {1}", sitemap.Location, sitemap.Items.Count());
        CrawlContext.Scheduler.Add(sitemap.Items.Select(x => new PageToCrawl(x.Location)));

        CrawlResult crawlResult = new CrawlResult();
        CrawlComplete = false;

        //await Task.Run(() => ParallelCrawlSite(crawlResult));
        ParallelCrawlSite(crawlResult);

        results.Add(crawlResult);
    }

    return results;
}
/// <summary>
/// Crawl and parse data from a URL.
/// </summary>
/// <param name="url">Source URL.</param>
/// <returns>Parse result.</returns>
public ParseResult ParseFromUrl(string url)
{
    if (String.IsNullOrEmpty(url))
    {
        throw new ArgumentNullException(nameof(url));
    }

    ParseResult ret = new ParseResult();
    ret.Xml = new ParseResult.XmlParseResult();

    HttpCrawler crawler = new HttpCrawler(url);
    CrawlResult cr = crawler.Get();
    if (!cr.Success)
    {
        ret.Time.End = DateTime.UtcNow;
        return ret;
    }

    byte[] sourceData = cr.Data;
    string sourceContent = Encoding.UTF8.GetString(sourceData);
    return ProcessSourceContent(sourceContent);
}
static void Main(string[] args)
{
    CrawlConfiguration config = new CrawlConfiguration();
    config.MaxConcurrentThreads = 1; // Web Extractor is not currently thread-safe.

    // Create the PhantomJS instance. This will spawn a new PhantomJS process using phantomjs.exe.
    // Make sure to dispose this instance or you will have a zombie process!
    IWebDriver driver = CreatePhantomJsDriver(config);

    // Create the content extractor that uses PhantomJS.
    IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);

    // Create a PageRequester that will use the extractor.
    IPageRequester requester = new PageRequester(config, extractor);

    using (IWebCrawler crawler = new PoliteWebCrawler(config, null, null, null, requester, null, null, null, null))
    {
        crawler.PageCrawlCompleted += OnPageCrawlCompleted;

        CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));
        if (result.ErrorOccurred)
        {
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }
    }

    Console.Read();
}
public static void Main(string[] args)
{
    PoliteWebCrawler crawler = new PoliteWebCrawler();
    crawler.PageCrawlCompletedAsync += Crawler_ProcessPageCrawlCompleted;

    var start = DateTime.Now;
    var uri = new Uri("https://lord.technology");
    CrawlResult result = crawler.Crawl(uri);

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }

    var finish = DateTime.Now;
    Console.WriteLine((finish - start).TotalMinutes);

    using (FileStream fs = File.Open(@"./crawl.json", FileMode.Create))
    using (StreamWriter sw = new StreamWriter(fs))
    using (JsonWriter jw = new JsonTextWriter(sw))
    {
        jw.Formatting = Formatting.Indented;
        JsonSerializer serializer = new JsonSerializer();
        serializer.Serialize(jw, new { nodes = _pages, edges = _relationships });
    }
}
public static void StartCrawlEbuyer(string url)
{
    try
    {
        PoliteWebCrawler crawler = new PoliteWebCrawler();
        crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
        crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
        crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

        TimeSpan ts = new TimeSpan(0, 0, 5);
        CancellationTokenSource cancellationTokenSource = new CancellationTokenSource(ts);
        CrawlResult result = crawler.Crawl(new Uri(url), cancellationTokenSource);

        if (result.ErrorOccurred)
        {
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }
    }
    catch (Exception)
    {
    }

    ExtractingHtml.ExtractDetailsEbuyer();
}
//Crawling code for GSM
public static void StartCrawlGSM(string url)
{
    PoliteWebCrawler crawler = new PoliteWebCrawler();
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStartingGSM;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompletedGSM;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowedGSM;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowedGSM;

    TimeSpan ts = new TimeSpan(0, 0, 0);
    CancellationTokenSource cancellationTokenSource = new CancellationTokenSource(ts);
    CrawlResult result = crawler.Crawl(new Uri(url), cancellationTokenSource);

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }

    //FileStream fs = new FileStream("url.txt", FileMode.Open);
    //StreamReader sr = new StreamReader(fs);
    //string str = "";
    //while ((str = sr.ReadLine()) != null)
    //{
    //    StartCrawl(str);
    //}

    ExtractingHtml.ExtractingDetailsGSM();
}
public object ExtractProperties(CrawlResult crawlResult, IHtmlCollection<IElement> elements)
{
    Directory.CreateDirectory(@"c:\temp\WebScraper");

    var result = new List<object>();
    using (var client = new HttpClient())
    {
        foreach (var element in elements.Where(e => e.TagName.Equals("a", StringComparison.OrdinalIgnoreCase)))
        {
            var href = new Uri(crawlResult.RequestUrl, new Uri(element.Attributes["href"].Value, UriKind.RelativeOrAbsolute));
            var fileName = Path.Combine(@"c:\temp\WebScraper", Path.GetFileName(href.LocalPath));

            if (File.Exists(fileName) == false)
            {
                Console.WriteLine($"Downloading {href} to {fileName}");
                using (var f = File.OpenWrite(fileName))
                using (var s = client.GetStreamAsync(href).Result)
                {
                    s.CopyTo(f);
                }
            }
            else
            {
                Console.WriteLine($"Skipping download of {href} to {fileName}");
            }

            result.Add(new { FileName = fileName, Title = element.TextContent });
        }

        return result;
    }
}
private void Start()
{
    _logger.Info("Starting...");

    var url = this.UriTextBox.Text.Trim();

    if (_threadManager == null)
    {
        _threadManager = new TaskThreadManager(10);
        var crawler = CreateCrawler(_threadManager);

        _crawlerCancellationTS = new CancellationTokenSource();
        _products.Clear();

        _crawlerTask = new Task(() =>
        {
            CrawlResult result = crawler.Crawl(new Uri(url), _crawlerCancellationTS);
            OnCrawlerCompleted();
        }, _crawlerCancellationTS.Token);

        _crawlerTask.Start();
    }
    else
    {
        _threadManager.Resume();
    }

    NotifyUIOnStatusChange(true);
}
public FeedFinishedInfo(DbObjectStandardFeed feed, int index, int count, CrawlResult result)
{
    this.Feed = feed;
    this.Index = index;
    this.Count = count;
    this.Result = result;
}