/// <summary>
/// Starts a download for every scorecard of the season that is flagged as available and has
/// a URL, blocks until all of those downloads have finished, and then saves the crawl
/// results back to the store.
/// </summary>
/// <param name="dataStore">Store the crawl results are read from and saved back to.</param>
/// <param name="season">Season whose scorecards should be downloaded.</param>
/// <remarks>
/// Logs a warning and returns without doing anything when no crawl results exist for the
/// season. <c>Task.WaitAll</c> rethrows download failures, in which case the save and the
/// "finished" log line are not reached.
/// </remarks>
private static void DownloadScorecards(FileStore dataStore, Season season)
{
    CrawlResults seasonResults = GetCrawlResultsForSeason(dataStore, season);
    if (seasonResults == null)
    {
        Log.WarnFormat("Season {0} has not been crawled yet.", season.Name);
        return;
    }

    Log.InfoFormat("Scorecard download started at {0} for season {1}", DateTime.Now.ToShortTimeString(), seasonResults.Season);

    List<Task> pendingDownloads = new List<Task>();
    foreach (ScorecardDetails scorecard in seasonResults.Classifications.SelectMany(c => c.Scorecards))
    {
        // Nothing to fetch unless the scorecard is marked available and carries a URL.
        if (!scorecard.ScorecardAvailable || string.IsNullOrEmpty(scorecard.ScorecardUrl))
        {
            continue;
        }

        Log.InfoFormat("Downloading scorecard for {0}", scorecard);
        pendingDownloads.Add(DownloadScorecardAsync(scorecard, dataStore));
    }

    Task.WaitAll(pendingDownloads.ToArray());

    SaveCrawlerResults(seasonResults, dataStore);

    Log.InfoFormat("Scorecard download finished at {0} for season {1}", DateTime.Now.ToShortTimeString(), seasonResults.Season);
}
/// <summary>
/// Loads the batting records stored under the id generated for the season's name and hands
/// them, together with the store, to <c>Batting.Reduce</c>.
/// </summary>
/// <param name="season">Season whose batting records are reduced.</param>
/// <param name="dataStore">Store the records are loaded from; also passed on to the reducer.</param>
private void ReduceBatting(Season season, FileStore dataStore)
{
    var battingRecordsId = IndividualBattingMap.GenerateId(season.Name);
    List<BattingRecord> seasonBatting = dataStore.Load<List<BattingRecord>>(battingRecordsId);

    Batting.Reduce(seasonBatting, dataStore);
}
/// <summary>
/// Crawls the given season with a new <c>Spider</c>, logs a dump of the outcome, and
/// returns the crawl results.
/// </summary>
/// <param name="season">Season to crawl.</param>
/// <returns>Whatever <c>Spider.Crawl</c> produced for the season.</returns>
private static CrawlResults RunCrawler(Season season)
{
    Log.InfoFormat("Crawler started at {0} for season {1}", DateTime.Now.ToShortTimeString(), season.Name);

    CrawlResults crawled = new Spider().Crawl(season);

    var dump = DumpResults(crawled);
    Log.InfoFormat("\n{0}", dump);
    Log.InfoFormat("Crawler finished at {0}.", DateTime.Now.ToShortTimeString());

    return crawled;
}
/// <summary>
/// Re-runs the spider's recheck over the crawl results already stored for the season, logs
/// a dump of the rechecked results, and saves them to the store.
/// </summary>
/// <param name="dataStore">Store holding the existing crawl results; receives the rechecked ones.</param>
/// <param name="season">Season to recheck.</param>
/// <remarks>
/// Logs a warning and returns without doing anything when the season has no stored crawl results.
/// </remarks>
private static void RecheckSeason(FileStore dataStore, Season season)
{
    CrawlResults previousCrawl = GetCrawlResultsForSeason(dataStore, season);
    if (previousCrawl == null)
    {
        Log.WarnFormat("Season {0} has not been crawled yet.", season.Name);
        return;
    }

    CrawlResults rechecked = new Spider().Recheck(previousCrawl);

    var dump = DumpResults(rechecked);
    Log.InfoFormat("\n{0}", dump);
    Log.InfoFormat("Crawler finished at {0}.", DateTime.Now.ToShortTimeString());

    SaveCrawlerResults(rechecked, dataStore);
}
/// <summary>
/// Crawls one season: reads the location links from the season's page, crawls every
/// location on its own task, and gathers the match classifications each task returns.
/// </summary>
/// <param name="season">Season to crawl; its name and URL identify the season page.</param>
/// <returns>
/// Crawl results carrying the season name and the classifications of every location task
/// that completed successfully. Failed location tasks are logged and skipped.
/// </returns>
/// <remarks>
/// Progress is written to the console ("Crawling &lt;name&gt; " ... " done.").
/// A season page without any location links yields an empty classification list.
/// </remarks>
public CrawlResults Crawl(Season season)
{
    Console.Write("Crawling " + season.Name + " ");

    CrawlResults results = new CrawlResults { Season = season.Name };

    CrawlerLinkDetails seasonPage = new CrawlerLinkDetails
    {
        SourcePageType = PageType.SeasonList,
        SourcePageUrl = "http://cricketarchive.com/Archive/Seasons/index.html",
        DestinationPageType = PageType.LocationList,
        DestinationUrl = season.Url,
        LinkText = season.Name
    };

    List<CrawlerLinkDetails> locationLinks = PageCrawler.CrawlLinksPage(seasonPage);

    var locationTasks = new List<Task<List<MatchClassification>>>();
    foreach (var link in locationLinks)
    {
        // Per-iteration copy so each lambda captures its own link, regardless of how the
        // compiler scopes the foreach variable.
        CrawlerLinkDetails l = link;
        locationTasks.Add(Task<List<MatchClassification>>.Factory.StartNew(() => CrawlLocation(l, season.Name)));
    }

    // Collect per task instead of through Task.Factory.ContinueWhenAll: ContinueWhenAll
    // throws ArgumentException for an empty task array, which made a season without
    // location links crash the crawl.
    List<MatchClassification> classifications = new List<MatchClassification>();
    foreach (Task<List<MatchClassification>> locationTask in locationTasks)
    {
        try
        {
            // Result blocks until the task has finished and rethrows its failure, if any.
            classifications.AddRange(locationTask.Result);
        }
        catch (AggregateException ex)
        {
            Log.Error("Unexpected exception", ex);
        }
    }

    results.Classifications = classifications;

    Console.WriteLine(" done.");
    return results;
}
/// <summary>
/// Runs <c>ParseScorecard</c> over every scorecard listed in the season's stored crawl
/// results, one after another.
/// </summary>
/// <param name="dataStore">Store the crawl results are read from; passed on to each parse call.</param>
/// <param name="season">Season whose scorecards should be parsed.</param>
/// <remarks>
/// Logs a warning and returns without doing anything when the season has no stored crawl results.
/// </remarks>
private static void ParseScorecards(FileStore dataStore, Season season)
{
    CrawlResults seasonResults = GetCrawlResultsForSeason(dataStore, season);
    if (seasonResults == null)
    {
        Log.WarnFormat("Season {0} has not been crawled yet.", season.Name);
        return;
    }

    Log.InfoFormat("Scorecard parsing started at {0} for season {1}", DateTime.Now.ToShortTimeString(), seasonResults.Season);

    foreach (ScorecardDetails scorecard in seasonResults.Classifications.SelectMany(c => c.Scorecards))
    {
        ParseScorecard(dataStore, scorecard);
    }

    Log.InfoFormat("Scorecard parsing finished at {0} for season {1}", DateTime.Now.ToShortTimeString(), season.Name);
}
/// <summary>
/// Runs the individual batting, bowling and fielding map functions over every match of the
/// season that can be loaded from the store, and saves the three combined record lists
/// under the ids generated for the season's name.
/// </summary>
/// <param name="dataStore">Store the crawl results and matches are loaded from and the map output is saved to.</param>
/// <param name="season">Season to map.</param>
/// <remarks>
/// Returns early (after logging) when the season has no stored crawl results or no match
/// records. Scorecards whose match cannot be loaded are skipped; if none can be loaded,
/// empty record lists are saved. Map tasks that fail are logged and left out of the output.
/// </remarks>
private void RunMapFunctions(FileStore dataStore, Season season)
{
    CrawlResults crawlResults = GetCrawlResultsForSeason(dataStore, season);
    if (crawlResults == null)
    {
        Log.WarnFormat("Season {0} has not been crawled yet.", season.Name);
        return;
    }

    Log.InfoFormat("Map started at {0} for season {1}", DateTime.Now.ToShortTimeString(), crawlResults.Season);

    // Materialise once: the query is needed for both the emptiness check and the loop,
    // and a deferred SelectMany would otherwise be evaluated twice.
    var matchRecords = crawlResults.Classifications.SelectMany(m => m.Scorecards).ToList();
    if (matchRecords.Count == 0)
    {
        Log.InfoFormat("No match records found for {0}", season.Name);
        return;
    }

    var battingTasks = new List<Task<List<BattingRecord>>>();
    var bowlingTasks = new List<Task<List<BowlingRecord>>>();
    var fieldingTasks = new List<Task<List<FieldingRecord>>>();

    foreach (ScorecardDetails details in matchRecords)
    {
        string id = CricketMatch.GenerateId(details.Season, details.MatchCode);

        // Declared inside the loop body, so each set of lambdas captures its own match.
        CricketMatch match = dataStore.Load<CricketMatch>(id);
        if (match == null)
        {
            continue;
        }

        battingTasks.Add(Task<List<BattingRecord>>.Factory.StartNew(() => IndividualBattingMap.Run(match)));
        bowlingTasks.Add(Task<List<BowlingRecord>>.Factory.StartNew(() => IndividualBowlingMap.Run(match)));
        fieldingTasks.Add(Task<List<FieldingRecord>>.Factory.StartNew(() => IndividualFieldingMap.Run(match)));
    }

    // All tasks are already running at this point; collecting them one list after the
    // other only serialises the waiting, not the map work. Collecting per task (rather
    // than via Task.Factory.ContinueWhenAll) also works when no match could be loaded:
    // ContinueWhenAll throws ArgumentException for an empty task array.
    List<BattingRecord> battingRecords = CollectMapResults(battingTasks);
    List<BowlingRecord> bowlingRecords = CollectMapResults(bowlingTasks);
    List<FieldingRecord> fieldingRecords = CollectMapResults(fieldingTasks);

    dataStore.Save(battingRecords, IndividualBattingMap.GenerateId(season.Name));
    dataStore.Save(bowlingRecords, IndividualBowlingMap.GenerateId(season.Name));
    dataStore.Save(fieldingRecords, IndividualFieldingMap.GenerateId(season.Name));

    Log.InfoFormat("Map finished at {0} for season {1}", DateTime.Now.ToShortTimeString(), season.Name);
}

/// <summary>
/// Waits for each map task in order and concatenates the record lists of those that
/// succeeded; failed tasks are logged and contribute nothing.
/// </summary>
/// <param name="mapTasks">Already started map tasks; may be empty.</param>
/// <returns>The combined records, in task order. Empty when there are no tasks.</returns>
private List<T> CollectMapResults<T>(IEnumerable<Task<List<T>>> mapTasks)
{
    List<T> collected = new List<T>();
    foreach (Task<List<T>> mapTask in mapTasks)
    {
        try
        {
            // Result blocks until the task has finished and rethrows its failure, if any.
            collected.AddRange(mapTask.Result);
        }
        catch (AggregateException ex)
        {
            Log.Error("Unexpected exception", ex);
        }
    }

    return collected;
}