private async void btnStart_Click(object sender, EventArgs e)
{
    pbMain.Value = 0;
    ToggleGroupBoxes(false);
    TitleBuilder.Build(this, "Initializing");
    try
    {
        var threadInfo = new ThreadInfo(txtThread.Text);
        var scraper = new Scraper(threadInfo, UpdateProgress);
        var files = await scraper.CollectFilePosts(cbWEBM.Checked, cbDuplicates.Checked);
        pbMain.Maximum = files.Count;
        await Task.WhenAll(files.Select(p => scraper.DownloadAsync(p, rbUID.Checked, txtPath.Text, this)));
        TitleBuilder.Build(this, "Completed", false);
    }
    catch (Exception ex)
    {
        TitleBuilder.Build(this, ex.Message, false);
    }
    ToggleGroupBoxes(true);
}
private static string GetPeriodData(ref Scraper.Scraper scraper, string userName, string period)
{
    // The original format string repeated "userName={0}" twice; one occurrence suffices.
    var url = string.Format("http://runkeeper.com/activitiesByDateRange?userName={0}&startDate={1}", userName, period);
    //var url = "http://runkeeper.com/activitiesByDateRange?userName="******"&startDate=" + period;
    var lastContent = scraper.PerformRequest(url, null);
    return lastContent;
}
private static string Login(ref Scraper.Scraper scraper, string username, string password)
{
    var lastContent = scraper.PerformRequest(Constants.RunkeeperConstants.URL_START);
    var postdata = "_eventName=submit&email=" + username + "&password=" + password;
    lastContent = scraper.PerformRequest(Constants.RunkeeperConstants.URL_LOGIN, postdata);
    return lastContent;
}
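A sketch of how the Runkeeper helpers in this listing (Login above, plus EnterHome and GetPeriodData shown elsewhere in this section) might compose into one session; it assumes all three helpers live in the same class, and the parameterless Scraper.Scraper constructor is an assumption based on the other snippets here.

private static void RunSession(string username, string password, string period)
{
    var scraper = new Scraper.Scraper();
    Login(ref scraper, username, password);                    // POST credentials (see Login above)
    EnterHome(ref scraper);                                    // warm up the authenticated session
    var data = GetPeriodData(ref scraper, username, period);   // fetch activities for the date range
    Console.WriteLine(data);
}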
private static void Main(string[] args)
{
    _boligScraper = new Scraper();
    _userPreference = GetUserPreference();
    _boligPortalRequest = new BoligPortalRequest
    {
        Amt = ((int)_userPreference.Region).ToString(),
        RentMin = "0",
        RentMax = _userPreference.RentMax,
        ZipCodes = _userPreference.ZipCodes,
        ApartmentType = _userPreference.ApartmentTypes,
        RentLength = new List<string> { "4" },
        Page = "1",
        Limit = "15",
        SortCol = "3",
        SortDesc = "1"
    };
    Console.WriteLine("{0} :: Creating infinite loop\n", DateTime.Now);

    // Poll indefinitely, one tick every 2 minutes.
    while (true)
    {
        Tick();
        Thread.Sleep(120000);
    }
}
private async void btnStart_Click(object sender, EventArgs e)
{
    pbMain.Value = 0;
    var parser = new Parser(txtThread.Text);
    if (!parser.IsValid())
    {
        return;
    }
    ToggleGroupBoxes(false);
    TitleBuilder.Build(this, "Initializing");
    var threadData = await parser.BuildThreadData();
    var scraper = new Scraper(threadData, UpdateProgress);
    var files = await scraper.CollectFileURLs(cbWEBM.Checked, cbDuplicates.Checked);
    if (files == null)
    {
        ToggleGroupBoxes(true); // re-enable the UI instead of leaving it disabled on failure
        return;
    }
    pbMain.Maximum = files.Count;
    await Task.WhenAll(files.Select(p => scraper.DownloadFileAsync(this, p, rbUID.Checked, txtPath.Text)));
    ToggleGroupBoxes(true);
    TitleBuilder.Build(this, "Completed", false);
}
public void SetUp()
{
    _boligScraper = new Scraper();
    // TODO: Add mock data so tests can run in offline mode,
    // e.g. a local .json file.
}
public void TestFixtureSetUp()
{
    list = new List<HtmlDoc>();
    var httpClient = new HttpClient { UserAgentName = "goodbot" };
    var scraper = new Scraper(httpClient, new ScrapedUrisDictionary());
    scraper.Subscribe(new ConsoleWriterObserver());
    scraper.Subscribe(list.Add);
    scraper.Scrape(new Uri("http://localhost:12345")).Wait();
}
public void Then_images_should_be_saved()
{
    var scraper = new Scraper();
    var io = new ImageScraperObserver(new FileWriter(new DirectoryInfo("c:\\temp")));
    scraper.Subscribe(io);
    scraper.Subscribe(new ConsoleWriterObserver());
    //scraper.Subscribe(x => Console.WriteLine(x.Uri));
    scraper.Scrape(new Uri("http://www.cambridgecupcakes.com/"));
}
private void Scrape_Click(object sender, EventArgs e)
{
    var url = urlBox.Text;
    AddLog(string.Format("Scrapes url:'{0}'", url));
    var scraper = new Scraper();
    var lastContent = scraper.PerformRequest(url);
    AddLog(string.Format("Scraped:'{0}', got:", url));
    AddLog(lastContent);
}
public void Then_no_results_should_be_returned()
{
    var list = new List<HtmlDoc>();
    var httpClient = new HttpClient { UserAgentName = "badbot" };
    var scraper = new Scraper(httpClient, new ScrapedUrisDictionary());
    scraper.Subscribe(new ConsoleWriterObserver());
    scraper.Subscribe(list.Add);
    scraper.Scrape(new Uri("http://localhost:12345")).Wait();
    list.Should().BeEmpty();
}
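The "goodbot"/"badbot" pair above exercises robots.txt handling via UserAgentName: the disallowed agent collects nothing. A companion assertion for the "goodbot" fixture (a sketch, assuming the same FluentAssertions style and the list field populated in TestFixtureSetUp above) would verify the compliant agent does get results:

public void Then_results_should_be_returned()
{
    // list is filled by the "goodbot" TestFixtureSetUp above; an allowed
    // user agent should have collected at least one document.
    list.Should().NotBeEmpty();
}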
private async Task FetchRandomThumbs()
{
    if (!Scraper.IsInitiated)
    {
        Scraper.InstanciateAllDerivedTypes();
    }
    var tasks = Scraper.InstanciatedScrapers.Select(FetchDataAsync).ToList();

    // Wait for all tasks to complete so the stack can be shuffled afterwards.
    foreach (var result in await Task.WhenAll(tasks).ConfigureAwait(false))
    {
        if (result == null)
        {
            continue; // skip this scraper
        }
        foreach (var item in result)
        {
            _pictureDataStack.Push(item);
        }
    }
    _pictureDataStack.Shuffle();
    RaiseDownloadRandomPagesCompleted();
}
private void OpenBagHook_CL(Action<Player> callback, Player plr)
{
    Action<ScrapedSentData> listener = (d) =>
    {
        int itemWho = d.Number;
        Item item = Main.item[itemWho];
        if (item?.active != true)
        {
            return;
        }
        this.ProcessBagItem(plr, item);
    };

    // Scraper.IsScrapingSentData = true;
    Scraper.AddSendDataListener(listener);
    callback.Invoke(plr);
    Scraper.IsScrapingSentData = false;
    Scraper.RemoveSendDataListener(listener);
}
public SchedulesController()
{
    var systemTime = new SystemTime();
    var httpClient = new HttpClient();
    var client = new Client(httpClient);
    var scraper = new Scraper(systemTime, client);
    var serializer = new ScrapeResultSerializer();
    var pathBuilder = new PathBuilder();
    var storageClient = new StorageClient(systemTime, pathBuilder);
    var uniqueClient = new UniqueClient(storageClient);
    var statusRecorder = new UploadStatusRecorder(storageClient, systemTime);
    _scrapeResultRecorder = new ScrapeResultRecorder(scraper, serializer, storageClient, uniqueClient, statusRecorder);
    _throttledScrapeResultRecorder = new ThrottledScrapeResultRecorder(systemTime, _scrapeResultRecorder);

    var gtfsConverter = new GtfsConverter();
    var gtfsCsvSerializer = new GtfsCsvSerializer();
    var gtfsFeedSerializer = new GtfsFeedSerializer(gtfsCsvSerializer);
    _gtfsFeedArchiveRecord = new GtfsFeedArchiveRecorder(storageClient, uniqueClient, gtfsConverter, gtfsFeedSerializer, statusRecorder);

    var settingsService = new SettingsProvider();
    _settings = new Settings(settingsService);
}
public void ShouldReturnTrueUponCreatingCsv()
{
    Scraper scraper = new Scraper();
    var eventList = new List<Event>
    {
        new Event
        {
            ArtistorEvent = "Billy Soomro's Interview",
            City = "London",
            Venue = "SongKick",
            Time = "12:30pm",
            Price = "Not specified",
            Availability = "Available",
            Date = "Fri 8th Dec 2017",
            SpecialGuests = "Bring Me The Horizon"
        }
    };
    var result = scraper.ExportCSV(eventList);
    Assert.AreEqual(true, result);
}
private void runkeeperTest_Click(object sender, EventArgs e)
{
    var scraper = new Scraper();

    var urlStart = "https://runkeeper.com";
    AddLog(string.Format("Scrapes url:'{0}'", urlStart));
    var lastContent = scraper.PerformRequest(urlStart);
    AddLog(string.Format("Scraped:'{0}', got:", urlStart));
    AddLog(lastContent);

    // The login URL and password expression were redacted in the original;
    // the request/log statements between them follow the pattern of the other two blocks.
    var urlLogin = "******";
    AddLog(string.Format("Scrapes url:'{0}'", urlLogin));
    var postdata = "_eventName=submit&email=" + userNameBox.Text + "&password="******";
    lastContent = scraper.PerformRequest(urlLogin, postdata);
    AddLog(string.Format("Scraped:'{0}', got:", urlLogin));
    AddLog(lastContent);

    var urlInloggad = "http://runkeeper.com/home";
    AddLog(string.Format("Scrapes url:'{0}'", urlInloggad));
    lastContent = scraper.PerformRequest(urlInloggad, null);
    AddLog(string.Format("Scraped:'{0}', got:", urlInloggad));
    AddLog(lastContent);
}
public async Task GetGamesForAPlayerWithNullPointsCell()
{
    var scraper = new Scraper(_transparentUserAgent);
    var player = new Player
    {
        ID = "heanebr01",
        FeedUrl = "https://www.basketball-reference.com/players/h/",
        Name = "Brian Heaney",
        FirstSeason = 1970,
        LastSeason = 1970,
        BirthDate = new DateTime(1946, 9, 3).AsUtc()
    };
    var games = await scraper.GetGames(player, 1970);
    var regularSeasonGames = games.Where(g => !g.IsPlayoffGame);
    var playoffGames = games.Where(g => g.IsPlayoffGame);
    Assert.AreEqual(14, regularSeasonGames.Count());
    Assert.AreEqual(6, playoffGames.Count());
    Assert.AreEqual(28, regularSeasonGames.Sum(g => g.Points));
    Assert.AreEqual(0, playoffGames.Sum(g => g.Points));
}
static void Main(string[] args)
{
    var builder = new ConfigurationBuilder()
        .SetBasePath(Directory.GetCurrentDirectory())
        .AddJsonFile("appsettings.json", optional: true, reloadOnChange: true);
    IConfigurationRoot configuration = builder.Build();

    var services = new ServiceCollection();
    services.AddDbContext<DatabaseContext>(options =>
        options.UseMySql(configuration["MySQLConnectionString"]));
    var serviceProvider = services.BuildServiceProvider();
    var _context = serviceProvider.GetService<DatabaseContext>();

    var scraper = new Scraper(_context);
    scraper.ScrapeMainList("https://psxdatacenter.com/ulist.html");
    scraper.ScrapeMainList("https://psxdatacenter.com/plist.html");
    scraper.ScrapeMainList("https://psxdatacenter.com/jlist.html");
}
public async Task GetGamesForAPlayerWhoPlayedInADoubleheader()
{
    var scraper = new Scraper(_transparentUserAgent);
    var player = new Player
    {
        ID = "bemorir01",
        FeedUrl = "https://www.basketball-reference.com/players/b/",
        Name = "Irv Bemoras",
        FirstSeason = 1954,
        LastSeason = 1957,
        BirthDate = new DateTime(1930, 11, 18).AsUtc()
    };
    var games = await scraper.GetGames(player, 1954);
    Assert.AreEqual(68, games.Count);
    Assert.AreEqual(68, games.Select(g => g.Date).Distinct().Count());
    Assert.AreEqual(67, games.Select(g => g.Date.Date).Distinct().Count());
    Assert.AreEqual("bemorir01 3/8/1954", games[64].ID);
    Assert.AreEqual("bemorir01 3/8/1954 2", games[65].ID);
    Assert.AreEqual(9, games[64].Points);
    Assert.AreEqual(13, games[65].Points);
    Assert.AreEqual(games[64].Date.AddHours(3), games[65].Date);
    Assert.AreEqual(506, games.Sum(g => g.Points));
}
public async Task GetGamesForAPlayerWhoPlayedOnTwoDifferentTeamsInTheSameDay()
{
    var scraper = new Scraper(_transparentUserAgent);
    var player = new Player
    {
        ID = "johnsne01",
        FeedUrl = "https://www.basketball-reference.com/players/j/",
        Name = "Neil Johnston",
        FirstSeason = 1952,
        LastSeason = 1959,
        BirthDate = new DateTime(1929, 2, 4).AsUtc()
    };
    var games = await scraper.GetGames(player, 1952);
    var regularSeasonGames = games.Where(g => !g.IsPlayoffGame);
    var playoffGames = games.Where(g => g.IsPlayoffGame);
    Assert.AreEqual(65, regularSeasonGames.Count());
    Assert.AreEqual(3, playoffGames.Count());
    Assert.AreEqual(64, regularSeasonGames.Where(g => g.Team == "PHW").Count());
    Assert.AreEqual(1, regularSeasonGames.Where(g => g.Team == "SYR").Count());
    Assert.AreEqual(3, playoffGames.Where(g => g.Team == "PHW").Count());
}
public async Task Run()
{
    var oldRequestResult = await Scraper.GetEntries();
    do
    {
        var newRequestResult = await Scraper.GetEntries();
        var newEntries = GetNewEntries(oldRequestResult, newRequestResult);
        foreach (var entry in newEntries)
        {
            await TelegramClient.SendMessage(new MessageRequest
            {
                Text = $"{entry.Title} - {entry.Price} - {entry.Time}"
            });
        }
        oldRequestResult = newRequestResult;
        await Task.Delay(10000); // Task.Delay instead of Thread.Sleep: don't block the thread inside an async method
    } while (true);
}
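GetNewEntries is referenced above but not shown. One plausible shape for it, sketched under the assumption that GetEntries returns a list of entries and that Title and Price identify an entry; the Entry type and its fields are inferred from the usage above, not confirmed by the source:

private static List<Entry> GetNewEntries(List<Entry> oldEntries, List<Entry> newEntries)
{
    // Treat an entry as "new" when no entry in the previous snapshot
    // shares the same title and price.
    return newEntries
        .Where(n => !oldEntries.Any(o => o.Title == n.Title && o.Price == n.Price))
        .ToList();
}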
private void btnSearch_Click(object sender, EventArgs e)
{
    string keywords = txtSearchBox.Text.Trim();
    try
    {
        if (keywords.Length > 0)
        {
            lviewResults.Items.Clear();
            _scraper = new Scraper("vuighe.net");
            _listItems = _scraper.GetSearchResults(keywords);
            GenerateItemsListView();
        }
        else
        {
            MessageBox.Show(@"Please input the keywords");
        }
    }
    catch (ArgumentNullException ex)
    {
        MessageBox.Show(@"Cannot find results for keyword '" + keywords + "'");
        // MessageBox.Show(ex.StackTrace);
    }
}
#pragma warning disable 1998
public async override global::System.Threading.Tasks.Task ExecuteAsync()
{
#line 1 "C:\Users\t-depra\Desktop\ScraperTrial\ScraperTrial\Views\Home\Index.cshtml"
    ViewData["Title"] = "Home Page";
    var scrape = new Scraper();
    // var file = new ScraperFileStore();
#line default
#line hidden
    BeginContext(121, 2, true);
    WriteLiteral("\r\n");
    EndContext();
    BeginContext(124, 17, false);
#line 7 "C:\Users\t-depra\Desktop\ScraperTrial\ScraperTrial\Views\Home\Index.cshtml"
    Write(scrape.Scraping());
#line default
#line hidden
    EndContext();
    BeginContext(141, 2, true);
    WriteLiteral("\r\n");
    EndContext();
}
static async Task ListCharacters(WeasylClient client)
{
    var user = await client.WhoamiAsync();
    Console.WriteLine(user.login);
    Console.WriteLine("----------");
    var charids = await Scraper.GetCharacterIdsAsync(user.login);
    foreach (int id in charids)
    {
        Console.WriteLine(id);
    }
    Console.WriteLine("----------");
    foreach (int id in charids.Take(3))
    {
        var details = await client.GetCharacterAsync(id);
        Console.WriteLine(details.title);
        Console.WriteLine("Species: " + details.species);
        Console.WriteLine();
    }
}
public override List<Dish> ReadWeeklyMenu()
{
    var dishes = new List<Dish>();
    var html = Scraper.ScrapeWebPage(Restaurant.KalasetPåFyran.Url);
    var cq = new CQ(html);
    var menuDate = DateHelper.MondayThisWeek();
    var lunchMenuTags = cq["#main-content p > strong, #main-content li"];
    if (lunchMenuTags == null)
    {
        return dishes;
    }
    foreach (var tag in lunchMenuTags)
    {
        if (tag.NodeName.Equals("strong", StringComparison.OrdinalIgnoreCase))
        {
            menuDate = ParseWeekDay(WebUtility.HtmlDecode(tag.InnerText).Trim());
            continue;
        }
        var description = WebUtility.HtmlDecode(tag.InnerText).Trim();
        /*
        if (WebUtility.HtmlDecode(description).Trim().Equals("(v) – finns som vegeteriskt alternativ", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }
        */
        var dish = new Dish(description, menuDate, Restaurant.KalasetPåFyran.Id);
        dishes.Add(dish);
    }
    return dishes;
}
public override List<Dish> ReadWeeklyMenu()
{
    var dishes = new List<Dish>();
    var html = Scraper.ScrapeWebPage(Restaurant.Aihaya.Url);
    var cq = new CQ(html);
    var menuDate = DateHelper.MondayThisWeek();
    var lunchMenuTags = cq[".lunch_menu .menu_header, .lunch_menu .td_title"];
    if (lunchMenuTags == null)
    {
        return dishes;
    }
    foreach (var tag in lunchMenuTags)
    {
        if (tag.HasClass("menu_header"))
        {
            menuDate = ParseWeekDay(WebUtility.HtmlDecode(tag.InnerText).Trim());
            continue;
        }
        var description = tag.InnerText;
        if (WebUtility.HtmlDecode(description).Trim().Equals("(v) – finns som vegeteriskt alternativ", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }
        var dish = new Dish(description, menuDate, Restaurant.Aihaya.Id);
        dishes.Add(dish);
    }
    return dishes;
}
public static DivisaModel GetRofexModel()
{
    var divisas = new List<DivisaViewModel>();
    HtmlNode html = new Scraper(new Uri("http://www.rofex.com.ar/"), Encoding.UTF7).GetNodes();
    var cierre = html.CssSelect("#cierre_monedas");
    var tabla = cierre.CssSelect("table tr").Skip(1);
    foreach (var htmlNode in tabla)
    {
        var tds = htmlNode.CssSelect("td").ToArray();
        var nombre = tds[0];
        var compra = tds[1];
        var venta = tds[2]; // was tds[1], which duplicated the buy price into the sell price
        var variacion = tds[3];
        divisas.Add(new DivisaViewModel
        {
            Nombre = nombre.InnerText,
            Simbolo = "U$S",
            ValorCompra = compra.InnerText.Remove(compra.InnerText.Length - 1),
            ValorVenta = venta.InnerText.Remove(venta.InnerText.Length - 1),
            Variacion = variacion.InnerText,
        });
    }
    var result = new DivisaModel
    {
        Actualizacion = DateTime.Now,
        Divisas = divisas,
    };
    return result;
}
public string Update(string guid, [FromBody] Scraper s)
{
    IScraper scraper = m_scraperService.GetScraper(guid);
    if (scraper == null)
    {
        var error = new LexicalAnalyzer.Models.Error();
        error.Message = "Could not find Scraper with the given GUID";
        return JsonConvert.SerializeObject(error);
    }
    if (s == null)
    {
        /* The JSON sent was not in the correct format */
        Response.StatusCode = 400; /* Bad Request */
        var error = new LexicalAnalyzer.Models.Error();
        error.Message = "Invalid structure for Scraper object";
        return JsonConvert.SerializeObject(error);
    }
    scraper.Properties = s.Properties;
    if (s.Status.ToLower() == "started")
    {
        m_scraperService.StartScraper(guid);
    }
    else if (s.Status.ToLower() == "paused")
    {
        m_scraperService.PauseScraper(guid);
    }
    else
    {
        Response.StatusCode = 400; /* Bad Request */
        var error = new LexicalAnalyzer.Models.Error();
        // Message now matches the values the comparisons above actually accept.
        error.Message = "The only valid Scraper status values to set are 'started' or 'paused'";
        return JsonConvert.SerializeObject(error);
    }
    return JsonConvert.SerializeObject(scraper);
}
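A sketch of what a caller might send to the Update action above. The route, host, HTTP verb, and the shape of Properties are assumptions; only the Status values come from the action itself.

static async Task StartScraperAsync(string guid)
{
    using (var client = new System.Net.Http.HttpClient())
    {
        // "started" and "paused" are the only Status values the action accepts.
        // The empty Properties array is a placeholder; its real shape is not shown in the source.
        var body = "{ \"Status\": \"started\", \"Properties\": [] }";
        var content = new System.Net.Http.StringContent(body, System.Text.Encoding.UTF8, "application/json");
        // Hypothetical route; adjust to the controller's actual routing.
        var response = await client.PutAsync($"http://localhost:5000/api/scraper/{guid}", content);
        response.EnsureSuccessStatusCode();
    }
}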
private void FillData()
{
    try
    {
        foreach (var data in _collection.FindAll())
        {
            // Remove invalid data.
            if (!CheckValid(data))
            {
                DeleteById(data.Id);
                continue;
            }
            _cache.Add(new PictureData(Scraper.GetScraperByName(Fussy.DecryptString(data.ScraperName)))
            {
                ThumbUrl = Fussy.DecryptString(data.ThumbUrl),
                PageUrl = Fussy.DecryptString(data.PageUrl)
            });
        }
    }
    catch (Exception ex)
    {
        ExManager.Ex(ex);
    }
}
public void Init()
{
    htmlDocument = HtmlDocumentFactory.FromPath(Constants.SampleFutureSyncFilePath);
    scraper = new Scraper(htmlDocument);
}
void _downloader_DownloadProgress(object sender, Scraper.Events.DownloadProgressEventArgs e)
{
    ProgressValue = e.PercentComplete;
    CurrentActionText = e.Message;
    InvokePropertyChanged("ProgressValue");
    InvokePropertyChanged("CurrentActionText");
}
private async Task SearchMobyGames(string term)
{
    var scraper = new Scraper(Application.Current.ScraperWebClient());
    var entries = await Task.Run(() => scraper.Search(term));
    var disambiguationDialog = new GameDisambiguationDialog(entries)
    {
        Owner = editGameViewModel.ParentWindow
    };
    if (disambiguationDialog.ShowDialog() != true)
    {
        return;
    }

    var url = disambiguationDialog.SelectedResult.Url;
    if (disambiguationDialog.SelectedResult.Releases.Any())
    {
        // Find the first release that matches our preferred platforms.
        foreach (var platform in PlatformPriorities)
        {
            var matchingRelease = disambiguationDialog.SelectedResult.Releases
                .FirstOrDefault(release => release.Platform.Equals(platform, StringComparison.OrdinalIgnoreCase));
            if (matchingRelease == null || string.IsNullOrEmpty(matchingRelease.Url))
            {
                continue;
            }
            url = matchingRelease.Url;
            break; // stop at the highest-priority match instead of letting later platforms overwrite it
        }
    }

    var gameEntry = scraper.GetGame(url);
    GetSpecs(gameEntry);
    GetScreenshots(gameEntry);
    editGameViewModel.GameScreenshots.Clear();
    editGameViewModel.Title = gameEntry.Name;
    editGameViewModel.GameMobyGamesSlug = gameEntry.Slug;
    editGameViewModel.GameLinks.Add(gameEntry.Url);

    var publisher = editGameViewModel.Publishers.ToList().Find(p => p.Slug == gameEntry.Publisher.Slug);
    if (publisher == null)
    {
        publisher = new Publisher
        {
            Name = gameEntry.Publisher.Name,
            Slug = gameEntry.Publisher.Slug,
            Links = new List<string> { gameEntry.Publisher.Url }
        };
        editGameViewModel.Publishers.Add(publisher);
    }
    editGameViewModel.GamePublisher = publisher;

    var developerCollection = editGameViewModel.Developers.ToList();
    foreach (var devEntry in gameEntry.Developers)
    {
        var developer = developerCollection.Find(d => d.Slug == devEntry.Slug);
        if (developer == null)
        {
            developer = new Developer
            {
                Name = devEntry.Name,
                Slug = devEntry.Slug,
                Links = new List<string> { devEntry.Url },
            };
            editGameViewModel.Developers.Add(developer);
        }
        editGameViewModel.GameDevelopers.Add(developer);
    }
}
void rule_RemoveRule(object sender, Scraper.Notifier.Event.RemoveRuleEventArgs e)
{
    e.Rule.RemoveRule -= new EventHandler<Scraper.Notifier.Event.RemoveRuleEventArgs>(rule_RemoveRule);
    Rules.Remove(e.Rule);
}
protected Crawler(IDocumentFactory documentFactory, IKeyValueStore<string, Result> store, IKeyValueStore<string, FetchTarget> frontier)
{
    _store = store;
    _frontier = frontier;

    var fetcherOptions = new FetcherOptions
    {
        UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    };
    var parserOptions = new ParserOptions { };
    var scraperOptions = new ScraperOptions { };
    var extractorOptions = new ExtractorOptions { };
    //var storerOptions = new StorerOptions { };
    var builderOptions = new BuilderOptions { };
    var providerOptions = new ProviderOptions { };
    //var dispatcherOptions = new DispatcherOptions { };

    Fetcher = new Fetcher(fetcherOptions);
    Parser = new Parser(parserOptions, documentFactory);
    Scraper = new Scraper(scraperOptions);
    Extractor = new Extractor(extractorOptions);
    Storer = new Storer(store);
    Builder = new Builder(builderOptions);
    Provider = new Provider(providerOptions, store, frontier);
    Dispatcher = new Dispatcher();

    // Wire the pipeline: fetch -> parse -> scrape/extract -> build -> store/provide -> dispatch -> fetch.
    Fetcher.SendTo(Parser, x => x.StatusCode == System.Net.HttpStatusCode.OK);
    Parser.SendTo(Scraper);
    Parser.SendTo(Extractor);
    Fetcher.SendTo(Builder, x => x.StatusCode == System.Net.HttpStatusCode.OK);
    Scraper.SendTo(Builder);
    Extractor.SendTo(Builder);
    Builder.SendTo(Storer);
    //Storer.LinkTo(new ActionBlock<Result>(x => { }));
    Builder.SendTo(Provider);
    Provider.SendTo(Dispatcher, x => x != null);
    Dispatcher.SendTo(Fetcher);
}
public ActionResult Scrape()
{
    Scraper.RunScraper();
    return RedirectToAction("Recent");
}
private static async Task Main(string[] args)
{
    if (File.Exists(ConfigPath))
    {
        Xml.LoadConfig(ConfigPath);
    }

    // Use the first command-line argument as the path, if one was given.
    string argInput = args.Length > 0 ? args[0] : null;

    string dmPath;
    DirectoryInfo chatDir;

PathInput:
    Console.Clear();
    if (argInput == null)
    {
        Logger.Print("Path of files to be scraped: ", LogType.Info, false);
        dmPath = Console.ReadLine();
    }
    else
    {
        dmPath = argInput;
    }

    if (!Directory.Exists(dmPath))
    {
        Console.Clear();
        Logger.Print("Path was not found.", LogType.Error);
        Thread.Sleep(1000);
        argInput = null; // fall back to interactive input so a bad argument doesn't loop forever
        goto PathInput;
    }
    else
    {
        chatDir = new DirectoryInfo(dmPath);
    }

    Console.Clear();
    Logger.Print("Press any key to start downloading...", LogType.Info);
    Console.Read();

    var downloader = new Downloader();
    _scraper = new Scraper(downloader, chatDir, dmPath);
    await _scraper.Execute();

    Console.WriteLine();
    Logger.Print($"{DateTime.Now} | Finished scraping and downloading all links and files!", LogType.Info);
    Console.ReadLine();
}
public void Scraper_Ctor_Null_Args()
{
    _ = new Scraper(null);
}
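As written, the test above passes only if the constructor tolerates a null argument. If the intent were instead to verify a guard clause, an explicit variant might look like the following sketch; the NUnit-style Assert.Throws and the expected exception type are assumptions, not confirmed by the source:

public void Scraper_Ctor_Null_Args_Throws()
{
    // Assumes NUnit; MSTest would use Assert.ThrowsException instead.
    Assert.Throws<ArgumentNullException>(() => new Scraper(null));
}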
void _downloader_DownloadCompleted(object sender, Scraper.Events.DownloadCompletedEventArgs e)
{
    State = DownloadState.Ok;
    if (e.Cancelled)
    {
        CurrentActionText = "Download was cancelled.";
        State = DownloadState.Warning;
    }
    else if (e.Error != null)
    {
        CurrentActionText = "Unable to download/save requested chapter";
        State = DownloadState.Error;
        _log.Error("Unable to download/save requested chapter.", e.Error);
    }
    Completed = true;
    InvokePropertyChanged("ProgressValue");
    InvokePropertyChanged("CurrentActionText");
    InvokePropertyChanged("Completed");
    InvokePropertyChanged("CanOpen");
    OnDownloadCompleted();
}
static void Main(string[] args)
{
    OrmLiteConfig.DialectProvider = SqliteDialect.Provider;
    CreateTables();
    GetAndCreateRegions();
    GetAndCreatePollutants();
    foreach (Pollutant pollutant in Globals.PollutantDict.Values)
    {
        Console.WriteLine("\nFetching values for Pollutant: " + pollutant.Name);
        var url = Globals.baseUrl + pollutant.Name;
        var httpClient = new HttpClient();
        var scraper = new Scraper(httpClient, new ScrapedUrisDictionary());
        scraper.Subscribe(new PollutantTableObserver(pollutant.Name));
        scraper.DisableRobotsProtocol = true;
        scraper.Scrape(new Uri(url)).Wait();
    }
}
public static IEnumerable<Result> Scrape(string url, int ageLimitMinutes, int maxResultCount, int scrapeDelayMs)
{
    List<Result> results = new List<Result>();
    if (!url.Contains("&SortBy=LastSeen&Order=desc"))
    {
        url = $"{url}&SortBy=LastSeen&Order=desc";
    }
    if (!Uri.TryCreate(url, UriKind.Absolute, out Uri searchUriAsc))
    {
        throw new ArgumentException($"Could not create Uri from {url}", nameof(url));
    }

    var searchResultsNodesXPath = "/html/body/div[2]/table/tbody/tr[3]/td[2]/section/div/table/tbody";
    int delay = 0;
    if (scrapeDelayMs > 0)
    {
        int pagesToScrape = maxResultCount / 10;
        delay = pagesToScrape > 0 ? scrapeDelayMs / pagesToScrape : scrapeDelayMs;
    }
    Thread.Sleep(delay);

    //GetProxies.FromFreeProxyListNet();
    var siteScraper = new Scraper();
    var docAsc = siteScraper.TryLoadHtmlDocument(searchUriAsc);
    var searchResultNodesAsc = siteScraper.GetChildNodes(searchResultsNodesXPath);
    if (searchResultNodesAsc == null)
    {
        string captchaXpath = "/html/body/div[2]/table/tbody/tr[3]/td[2]/section/div/form/div/div";
        var captchaNode = siteScraper.GetNode(captchaXpath);
        if (captchaNode != null && PromptUserForCaptcha(url))
        {
            return Scrape(url, ageLimitMinutes, maxResultCount, scrapeDelayMs);
        }
        return results;
    }

    List<HtmlNode> nodesToParse = new List<HtmlNode>();
    nodesToParse.AddRange(from srn in searchResultNodesAsc
                          where srn.GetClasses().Contains("cursor-pointer")
                          select srn);
    if (nodesToParse.Count > maxResultCount)
    {
        nodesToParse.RemoveRange((nodesToParse.Count / 2) - ((nodesToParse.Count - maxResultCount) / 2),
                                 nodesToParse.Count - maxResultCount);
    }

    var fieldsWereInterestedIn = GetFieldSet();
    foreach (var field in fieldsWereInterestedIn) // remove parent node path from field path
    {
        string updatedPath = field.xPath.Replace(searchResultsNodesXPath, "");
        field.xPath = updatedPath;
    }

    foreach (var node in nodesToParse)
    {
        if (node != null)
        {
            var parsedNodeFields = Scraper.ParseNodeFields(node, fieldsWereInterestedIn);
            results.Add(ReadFieldSet(parsedNodeFields));
        }
    }

    if (results.Count < maxResultCount && results.Count == 10)
    {
        if (url.Contains("&SortBy=LastSeen&Order=desc"))
        {
            if ((from r in results
                 where DateTime.Now.Subtract(r.LastSeen).TotalMinutes >= ageLimitMinutes
                 select r).Any())
            {
                return results;
            }
        }

        // Check for more pages of results.
        var paginationNodes = siteScraper.GetChildNodes("/html/body/div[2]/table/tbody/tr[3]/td[2]/section/div/div[3]/ul");
        foreach (var lineItemNode in paginationNodes) // look at all LIs
        {
            if (!lineItemNode.HasChildNodes)
            {
                continue;
            }
            foreach (var childNode in lineItemNode.ChildNodes)
            {
                if (!childNode.HasClass("disabled") && childNode.InnerText == ">" && childNode.Attributes.Contains("href"))
                {
                    string nextPageUrl = childNode.Attributes["href"].Value.Replace("amp;", "");
                    results.AddRange(Scrape(nextPageUrl, ageLimitMinutes, maxResultCount - results.Count, scrapeDelayMs - delay));
                }
            }
        }
    }
    return results;
}
static void Main(string[] args)
{
    bool offline = false;
    for (int i = 0; i < args.Length; i++)
    {
        if (string.Compare(args[i], "-help", true) == 0)
        {
            Console.WriteLine("Options:");
            Console.WriteLine("\t-clean\tclear local cache");
            Console.WriteLine("\t-help\tshow this message");
            Console.WriteLine("\t-o\toffline mode (use cache only)");
            return;
        }
        if (string.Compare(args[i], "-clean", true) == 0)
        {
            new Cache().Clean();
            return;
        }
        if (string.Compare(args[i], "-o", true) == 0)
        {
            offline = true;
        }
    }

    Cache cache = new Cache();
    IOrderLoader loader = null;
    if (!offline)
    {
        Amz.Auth.CookiesFirefox cf = new Auth.CookiesFirefox(Properties.Settings.Default.BaseDomain);
        if (cf.Count > 0)
        {
            Console.WriteLine("Trying Firefox login credentials (" + cf.Count + " cookies)...");
            loader = new Scraper(cf);
        }
        else
        {
            Console.Error.WriteLine("Could not log in!");
        }
    }
    else
    {
        loader = cache;
    }

    if (loader != null)
    {
        try
        {
            var years = loader.LoadOverview(Properties.Settings.Default.StartUrl);
            double total = 0.0;
            foreach (var n in years)
            {
                Console.WriteLine("Loading " + n + "...");
                var orders = loader.LoadYear(n, Properties.Settings.Default.HistoryUrlTemplate);
                if (orders.Count != cache.Store(orders))
                {
                    loader = cache;
                }
                double yearTotal = orders.Aggregate(0.0, (acc, o) => acc + o.Sum);
                Console.WriteLine("\tTotal: " + yearTotal);
                total += yearTotal;
#if DEBUG
                if (n < 2015)
                {
                    break;
                }
#endif
            }
            Console.WriteLine("Total: " + total);
        }
        catch (Exception exc)
        {
            Console.Error.WriteLine(exc.Message);
        }
    }
    cache.Dispose();
#if DEBUG
    Console.WriteLine("Any key to exit.");
    Console.ReadKey();
#endif
}
public ActionResult Index()
{
    IEnumerable<Headline> headlines = Scraper.GetHeadlines();
    return View(headlines);
}
public void Scrape()
{
    bool covers;
    bool rescrape;
    var view = new ConfirmScrapeView();
    if (view.ShowDialog() == true)
    {
        covers = view._viewModel.GenerateCovers;
        rescrape = view._viewModel.ReScrape;
    }
    else
    {
        return;
    }

    var scraper = new Scraper();
    ProgressReportingActive = true;
    scraper.BookChanged += MainViewModel.i_BookChanged;
    scraper.Worker.RunWorkerCompleted += _worker_RunWorkerCompleted;
    scraper.ProgressComplete += delegate { ProgressReportingActive = false; }; // subscribe before starting so the event cannot be missed
    scraper.Scrape(SelectedSourceDirectory, MainViewModel.Books.Cast<Book>().ToList(), covers, rescrape);
    Refresh();
    _library.CleanImages();
}
static void Main(string[] args) { try { Console.WriteLine("Please enter which Udemy course URL that you would like to scrape:"); var udemyCourseURL = Console.ReadLine() ?? string.Empty; using (WebClient client = new WebClient()) { string content = client.DownloadString(udemyCourseURL); ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder() .WithData(content).Build(); /* * string content = client.DownloadString($"http://{craigsListCity.Replace(" ", string.Empty)}.craigslist.org/{Method}/{craigsListCategoryName}"); * * ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder() * .WithData(content) * .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)") * .WithRegexOption(RegexOptions.ExplicitCapture) * .WithPart(new ScrapeCriteriaPartBuilder() * .WithRegex(@">(.*?</a>") * .WithRegexOption(RegexOptions.Singleline) * .Build()) * .WithPart(new ScrapeCriteriaPartBuilder() * .WithRegex(@"href=\""(.*?)\""") * .WithRegexOption(RegexOptions.Singleline) * .Build()) * .Build(); */ string content = client.DownloadString($"https://www.udemy.com/course/learn-csharp-by-building-applications/"); using (FileStream fileStream = new FileStream("output.html", FileMode.Create)) { using (StreamWriter streamWriter = new StreamWriter(fileStream)) { streamWriter.Write(content); } } ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder() .WithData(content) .WithRegex(@"/watch\?v=(.*?)~" .Replace('~', '\"')) .WithRegexOption(RegexOptions.ExplicitCapture) .Build(); Scraper scraper = new Scraper(); var scrapedElements = scraper.Scrape(scrapeCriteria); if (scrapedElements.Any()) { foreach (var scrapedElement in scrapedElements) { Console.WriteLine(scrapedElement); } } else { Console.WriteLine("There were no matches for the specified scrape criteria."); } } } catch (Exception ex) { Console.WriteLine(ex.Message); } }
static void Search()
{
    scraper = new Scraper();
    SearchAndOutput();
}
private static string EnterHome(ref Scraper.Scraper scraper)
{
    var lastContent = scraper.PerformRequest(Constants.RunkeeperConstants.URL_HOME, null);
    return lastContent;
}
static void Search(string searchTerm)
{
    scraper = new Scraper(searchTerm);
    SearchAndOutput();
}
static void Main(string[] args)
{
    try
    {
        // Get city and category from the user.
        Console.Write("City to scrape information for: ");
        string city = Console.ReadLine() ?? string.Empty;
        Console.Write("CraigsList category: ");
        string category = Console.ReadLine() ?? string.Empty;

        // Use WebClient to pull the web page, then scrape the listing URLs and descriptions.
        using (WebClient client = new WebClient())
        {
            Console.WriteLine($"Scraping page http://{city.Replace(" ", string.Empty)}.craigslist.org/{Method}/{category}");
            string content = client.DownloadString($"http://{city.Replace(" ", string.Empty)}.craigslist.org/{Method}/{category}");

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                .WithData(content)
                // Regex for the entire listing element.
                .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                .WithRegexOption(RegexOptions.ExplicitCapture)
                // Part criteria for the listing description.
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@">(.*?)</a>")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                // Part criteria for the listing URL.
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@"href=\""(.*?)\""")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                .Build();

            // Extract listing elements from the page, then extract the parts from each element.
            Scraper scraper = new Scraper();
            var scrapedElements = scraper.Scrape(scrapeCriteria);

            // Display the scraped parts, if any exist.
            if (scrapedElements.Any())
            {
                foreach (var scrapedElement in scrapedElements)
                {
                    Console.WriteLine(scrapedElement);
                }
            }
            else
            {
                Console.WriteLine("There were no matches for the entered city and category.");
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
    finally
    {
        Console.Write("Press any key to exit.");
        Console.ReadLine();
    }
}
static void Search(string searchTerm, int pageFrom, int pageTo)
{
    scraper = new Scraper(searchTerm, pageFrom, pageTo);
    SearchAndOutput();
}
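The three Search overloads in this listing differ only in how the shared scraper field is constructed. A sketch of a command-line dispatcher over them; the argument layout and usage string are assumptions, not taken from the source:

static void Main(string[] args)
{
    // Route to the matching Search overload: no args, term only, or term plus a page range.
    switch (args.Length)
    {
        case 0:
            Search();
            break;
        case 1:
            Search(args[0]);
            break;
        case 3:
            Search(args[0], int.Parse(args[1]), int.Parse(args[2]));
            break;
        default:
            Console.WriteLine("Usage: <searchTerm> [<pageFrom> <pageTo>]");
            break;
    }
}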
void Scrape()
{
    Scraper = _auctionWebScraperFactory.CreateAuctionWebScraper();
    Scraper.StartAsync();
}
static void Main(string[] args)
{
    var scraper = new Scraper();
    scraper.Run();
}