/// <summary>
/// Loads the Racing Post "time-order" results page for <paramref name="date"/> and queues every
/// race found on it whose id is not already in <paramref name="raceIds"/> as a new ScrapeRace row.
/// </summary>
/// <param name="date">Day whose results page is loaded.</param>
/// <param name="raceIds">Race ids that are already queued; matching races are skipped.</param>
/// <param name="db">Context used to add missing courses and the queued races. Changes are saved per race.</param>
public static void DownloadRaceList(DateTime date, List<int?> raceIds, RacingPostRacesEntities db)
{
    // Invariant culture keeps the yyyy-MM-dd URL segment independent of the machine's calendar settings.
    var url = string.Format(
        @"https://www.racingpost.com/results/{0}/time-order",
        date.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture));

    var web = new HtmlWeb();
    var doc = web.Load(url);
    var nodes = doc.QuerySelectorAll("div .rp-timeView__raceInfo").ToList();

    foreach (var item in nodes)
    {
        var courseUrl = baseUrl + item.ChildNodes[1].ChildNodes[1].Attributes["href"].Value;
        var courseId = Helper.GetIdfromUrl(courseUrl, "https://www.racingpost.com/profile/course/");

        // Entries without a race link cannot be scraped, so they are skipped.
        var raceAnchor = item.ChildNodes[3].ChildNodes[1];
        if (!raceAnchor.Attributes.Any(a => a.Name == "href"))
        {
            continue;
        }

        string raceUrl = baseUrl + raceAnchor.Attributes["href"].Value;

        // The race id is the last path segment; a non-numeric tail skips this link
        // instead of aborting the whole page.
        int parsedRaceId;
        if (!int.TryParse(raceUrl.Split('/').LastOrDefault(), out parsedRaceId))
        {
            Console.Write(string.Format("Skipping race link without a numeric id: {0} \n", raceUrl));
            continue;
        }

        int? raceId = parsedRaceId;
        if (raceIds.Any(r => r == raceId))
        {
            continue;
        }

        // Create the course on first sight and keep the in-memory course list in sync.
        var course = AllCourses.FirstOrDefault(c => c.Id == courseId);
        if (course == null)
        {
            course = new RPCourse
            {
                Id = courseId,
                Name = courseUrl.Split('/').LastOrDefault().ToUpper()
            };
            db.RPCourses.Add(course);
            db.SaveChanges();
            AllCourses.Add(course);
        }

        string country = course.Country;

        // Save the url to be scraped later.
        ScrapeRace scrapeRace = new ScrapeRace();
        scrapeRace.Link = raceUrl;
        scrapeRace.RaceId = raceId;
        scrapeRace.RaceDate = date;
        scrapeRace.Scraped = false;
        scrapeRace.Required = true;
        scrapeRace.CourseUrl = courseUrl;
        // Courses without a stored country are recorded as "GB".
        scrapeRace.Country = string.IsNullOrEmpty(country) ? "GB" : country;

        db.ScrapeRaces.Add(scrapeRace);
        db.SaveChanges();
    }
}
/// <summary>
/// Writes a progress line to the console, copies the profile fields of
/// <paramref name="result"/> into a new RPHorse and saves it through <paramref name="db"/>.
/// </summary>
/// <param name="db">Context the horse is added to and saved with.</param>
/// <param name="result">Downloaded horse data; only its <c>profile</c> member is read.</param>
private static void SaveHorse(RacingPostRacesEntities db, RootObject result)
{
    var profile = result.profile;
    Console.Write(string.Format("Downlaod horse {0} name {1} \n", profile.horseUid, profile.horseName));

    var birthDate = Convert.ToDateTime(profile.horseDateOfBirth);

    var horse = new RPHorse
    {
        Name = profile.horseName,
        RPId = profile.horseUid,
        Country = profile.horseCountryOriginCode,
        Colour = profile.horseColour,
        Sex = profile.horseSexCode,
        SireId = profile.sireUid,
        DamId = profile.damUid,
        FoalYear = birthDate.Year
    };

    // A foal year that is not greater than 1 is replaced by the 1/1/1753 placeholder date
    // (presumably the minimum the database column accepts - TODO confirm).
    horse.FoalDate = horse.FoalYear > 1 ? birthDate : Convert.ToDateTime("1/1/1753");
    horse.PostTemplate = true;

    db.RPHorses.Add(horse);
    db.SaveChanges();
}
/// <summary>
/// Stores every runner of <paramref name="race"/> as an RPRunner row, adding jockeys and
/// trainers that are not in the database yet. Changes are saved once per runner.
/// </summary>
/// <param name="race">Race whose <c>Runners</c> collection is persisted.</param>
public static void SaveRunner(Race race)
{
    using (var db = new RacingPostRacesEntities())
    {
        int? lastPosition = 0;

        foreach (var runner in race.Runners)
        {
            var entity = new RPRunner();
            entity.HorseId = runner.HorseId;
            entity.RaceId = race.Id;

            bool didNotFinish = DidNotFinsh.Any(df => df.Equals(runner.PosTemp));
            if (didNotFinish)
            {
                // Non-finishers keep their raw code and take the position after the previous runner.
                entity.DidNotFinish = runner.PosTemp;
                entity.Position = lastPosition + 1;
            }
            else
            {
                entity.Position = Convert.ToInt32(runner.PosTemp);
            }
            lastPosition = entity.Position;

            entity.Status = "Runner";
            if (string.IsNullOrEmpty(runner.Draw))
            {
                entity.Draw = 0;
            }
            else
            {
                entity.Draw = Convert.ToInt32(runner.Draw);
            }
            entity.Distance = runner.Distance;
            entity.Price = runner.SP;
            entity.WeightRaw = runner.WeightRaw;
            entity.Age = Convert.ToInt32(runner.Age);

            var knownJockey = db.Jockeys.FirstOrDefault(j => j.Id == runner.JockeyId);
            if (knownJockey == null)
            {
                db.Jockeys.Add(new Jockey { Id = runner.JockeyId, Name = runner.Jockey });
            }

            var knownTrainer = db.Trainers.FirstOrDefault(j => j.Id == runner.TrainerId);
            if (knownTrainer == null)
            {
                db.Trainers.Add(new Trainer { Id = runner.TrainerId, Name = runner.Trainer });
            }

            entity.JockeyId = runner.JockeyId;
            entity.TrainerId = runner.TrainerId;
            entity.PostTemplate = true;

            SaveHorse(new Horse { Id = (int)entity.HorseId });

            db.RPRunners.Add(entity);
            db.SaveChanges();
        }
    }
}
/// <summary>
/// Reads the horse JSON export from a fixed local path, runs the HorseCleaning steps on each
/// horse and inserts all of them as RPHorse rows with a single SaveChanges call.
/// </summary>
/// <param name="args">Not used.</param>
static void HorseScrape(string[] args)
{
    RPHorses rpHorses;

    // The file is only needed while the JSON is parsed, so it is closed before the database work starts.
    using (StreamReader file = File.OpenText(@"C:\Users\MuhammadZubair\Documents\BELData\horse(Feb)(2).json"))
    using (JsonTextReader reader = new JsonTextReader(file))
    {
        JObject root = (JObject)JToken.ReadFrom(reader);
        rpHorses = root.ToObject<RPHorses>();
    }

    Cleaner.GetBaseData();

    using (RacingPostRacesEntities db = new RacingPostRacesEntities())
    {
        foreach (var item in rpHorses.Horses)
        {
            HorseCleaning.PrcessHorse(item);
            HorseCleaning.PrcessHeader(item);

            RPHorse rpHorse = new RPHorse
            {
                Name = item.Name,
                RPId = item.Id,
                Country = item.Country,
                Colour = item.Color,
                Sex = item.Sex,
                SireId = item.SireId,
                DamId = item.DamId,
                FoalDate = item.DOB,
                FoalYear = item.DOB.Year,
                PostTemplate = true
            };
            db.RPHorses.Add(rpHorse);
        }

        db.SaveChanges();
    }
}
/// <summary>
/// Inserts <paramref name="race"/> as an RPRace row together with its runners, unless a race
/// with the same id is already stored.
/// </summary>
/// <param name="race">Race to persist; its <c>Id</c> must not be 0.</param>
/// <exception cref="ArgumentNullException">Thrown when <c>race.Id</c> is 0.</exception>
public static void SaveRace(Race race)
{
    if (race.Id == 0)
    {
        throw new ArgumentNullException();
    }

    using (var db = new RacingPostRacesEntities())
    {
        var existing = db.RPRaces.FirstOrDefault(r => r.Id == race.Id);
        if (existing != null)
        {
            return;
        }

        var entity = new RPRace
        {
            Id = race.Id,
            CourseId = race.CourseId,
            StartTime = race.StartTime,
            Name = race.Name,
            RaceType = race.RaceType,
            Handicap = race.Handicap,
            Chase = race.Chase,
            Fences = race.Fences,
            FencesOmitted = race.FencesOmitted,
            FencesHurdles = race.FencesHurdles,
            ClassRaw = race.ClassRaw,
            Class = race.Class,
            GradeGroup = race.GradeGroup,
            Rating = race.Rating,
            Eligibility = race.Eligibility,
            DistanceYards = race.DistanceYards,
            DistanceStd = race.DistanceStd,
            Distance = race.Distance,
            Going = race.Going,
            PrizeMoney = race.PrizeMoney,
            Prize1st = race.Prize1st,
            Prize2nd = race.Prize2nd,
            Prize3rd = race.Prize3rd,
            Prize4th = race.Prize4th,
            Prize5th = race.Prize5th,
            Prize6th = race.Prize6th,
            CurrencyUnit = race.CurrencyUnit,
            Runners = Convert.ToInt32(race.NoOfRunners),
            Time = race.WinTime,
            NonRunners = race.NonRunners,
            PostTemplate = true
        };

        db.RPRaces.Add(entity);
        // Runners are written by SaveRunner before this context saves the race row.
        SaveRunner(race);
        db.SaveChanges();
    }
}
/// <summary>
/// Downloads every queued race link that is required and not yet scraped, oldest race date first.
/// Each link is attempted up to four times, each attempt in its own transaction; a link is
/// marked as scraped only when its attempt commits.
/// </summary>
public static void ScrapeRace()
{
    using (RacingPostRacesEntities db = new RacingPostRacesEntities())
    {
        Courses = db.RPCourses.ToList();

        var linksToDownload = db.ScrapeRaces
            .Where(link => link.Required == true && link.Scraped == false)
            .OrderBy(d => d.RaceDate)
            .ToList();

        foreach (var url in linksToDownload)
        {
            // Pause between races to throttle requests to the site.
            Thread.Sleep(1000);

            int retry = 0;
            while (retry <= 3)
            {
                using (System.Data.Entity.DbContextTransaction dbTran = db.Database.BeginTransaction())
                {
                    try
                    {
                        Console.Write(string.Format("Downloading race for {0} - {1}\n", url.RaceDate, url.RaceId));
                        DownloadSingleRace(url.Link, db);
                        url.Scraped = true;
                        db.SaveChanges();
                        dbTran.Commit();
                        break;
                    }
                    catch (Exception ex)
                    {
                        dbTran.Rollback();
                        // Rolling back the transaction does not reset the context: without this,
                        // entities added by the failed attempt (and the Scraped flag) would be
                        // written by the next successful SaveChanges.
                        DiscardUnsavedChanges(db);
                        retry++;
                        Console.Write(string.Format("Failed to download race {0} (attempt {1} of 4): {2}\n", url.RaceId, retry, ex.Message));
                    }
                }
            }
        }
    }
}

/// <summary>
/// Drops all pending changes from the context: added entities are detached, modified entities
/// get their original values back and deleted entities are restored to unchanged.
/// </summary>
private static void DiscardUnsavedChanges(RacingPostRacesEntities db)
{
    foreach (var entry in db.ChangeTracker.Entries().ToList())
    {
        switch (entry.State)
        {
            case System.Data.Entity.EntityState.Added:
                entry.State = System.Data.Entity.EntityState.Detached;
                break;
            case System.Data.Entity.EntityState.Modified:
                entry.CurrentValues.SetValues(entry.OriginalValues);
                entry.State = System.Data.Entity.EntityState.Unchanged;
                break;
            case System.Data.Entity.EntityState.Deleted:
                entry.State = System.Data.Entity.EntityState.Unchanged;
                break;
        }
    }
}
/// <summary>
/// Walks day by day from the stored LastDateScraped of the first ScrapeCourses row up to and
/// including <paramref name="endDate"/>, queueing the race links of each day and storing the
/// day as the new LastDateScraped after it has been processed.
/// </summary>
/// <param name="endDate">Last day (inclusive) whose race list is downloaded.</param>
public static void ScrapeRaceList(DateTime endDate)
{
    using (var db = new RacingPostRacesEntities())
    {
        var day = db.ScrapeCourses.FirstOrDefault().LastDateScraped;
        // Race ids that are already queued, so DownloadRaceList can skip them.
        var knownRaceIds = db.ScrapeRaces.Select(s => s.RaceId).ToList();
        AllCourses = db.RPCourses.ToList();

        for (; day <= endDate; day = day.AddDays(1))
        {
            Thread.Sleep(2000);
            Console.Write(string.Format("Scrape links for date {0} \n", day));

            DownloadRaceList(day, knownRaceIds, db);

            var progress = db.ScrapeCourses.FirstOrDefault();
            progress.LastDateScraped = day;
            db.SaveChanges();
        }
    }
}
/// <summary>
/// Reads every "rp-horseTable__mainRow" row of the first tbody in <paramref name="doc"/> and adds
/// one RPRunner per row to <paramref name="db"/>. Jockeys and trainers that are not stored yet
/// are added and saved immediately; the runners themselves are only added, not saved, here.
/// </summary>
/// <param name="race">Race the runners belong to; only its <c>Id</c> is read.</param>
/// <param name="doc">Parsed race result page.</param>
/// <param name="db">Context that receives the new rows.</param>
public static void ProcessRunner(RPRace race, HtmlDocument doc, RacingPostRacesEntities db)
{
    var rows = doc.QuerySelectorAll("tbody").FirstOrDefault().ChildNodes
        .Where(c => c.Name == "tr")
        .Where(row => row.Attributes[0].Value == "rp-horseTable__mainRow")
        .ToList();

    foreach (var item in rows)
    {
        var runner = new RPRunner();
        runner.Status = "Runner";
        runner.PostTemplate = true;
        runner.RaceId = race.Id;

        // The position cell is looked up once; it holds the finishing position and the draw.
        var posCell = item.QuerySelectorAll("div .rp-horseTable__pos").FirstOrDefault().ChildNodes[3].ChildNodes[1];

        var posTemp = posCell.ChildNodes[0].InnerHtml.Replace("\n", "").Trim();
        if (DidNotFinsh.Any(df => df.Equals(posTemp)))
        {
            runner.DidNotFinish = posTemp;
        }
        else
        {
            runner.Position = Convert.ToInt32(posTemp);
        }

        // A draw that is missing, non-numeric or not positive is stored as null.
        string draw = posCell.ChildNodes[2].InnerHtml.Replace(" (", "").Replace(")", "").Trim();
        int drawPos;
        int.TryParse(draw, out drawPos);
        runner.Draw = drawPos > 0 ? drawPos : (int?)null;

        var lengthNode = item.QuerySelectorAll("span .rp-horseTable__pos__length").FirstOrDefault();
        if (lengthNode != null && lengthNode.ChildNodes.Count >= 2)
        {
            runner.Distance = lengthNode.ChildNodes[1].InnerHtml;
        }

        var horseUrl = item.QuerySelectorAll("a .rp-horseTable__horse__name").FirstOrDefault().Attributes[0].Value;
        runner.HorseId = Helper.GetIdfromUrl(horseUrl, "/profile/horse/");
        runner.Price = item.QuerySelectorAll("span .rp-horseTable__horse__price").FirstOrDefault().InnerHtml.Replace("\n", "").Trim();

        var persons = item.QuerySelectorAll("span .rp-horseTable__human__wrapper");

        // Jockey info
        var jockeyUrl = persons.FirstOrDefault().ChildNodes[1];
        runner.JockeyId = Helper.GetIdfromUrl(jockeyUrl.Attributes[0].Value, "/profile/jockey/");
        Jockey jockey = new Jockey();
        jockey.Id = Convert.ToInt32(runner.JockeyId);
        jockey.Name = jockeyUrl.InnerHtml.Replace("\n", "").Trim();
        // Cut the name at the first tag; a name without any markup is kept as is
        // (Substring with IndexOf == -1 would throw).
        int jockeyTagStart = jockey.Name.IndexOf("<");
        if (jockeyTagStart >= 0)
        {
            jockey.Name = jockey.Name.Substring(0, jockeyTagStart);
        }
        if (!db.Jockeys.Any(j => j.Id == jockey.Id))
        {
            db.Jockeys.Add(jockey);
            db.SaveChanges();
        }

        // Trainer info
        var trainerUrl = persons[1].ChildNodes[1];
        runner.TrainerId = Helper.GetIdfromUrl(trainerUrl.Attributes[0].Value, "/profile/trainer/");
        Trainer trainer = new Trainer();
        trainer.Id = Convert.ToInt32(runner.TrainerId);
        trainer.Name = trainerUrl.InnerHtml.Replace("\n", "").Trim();
        int trainerTagStart = trainer.Name.IndexOf("<");
        if (trainerTagStart > 0)
        {
            trainer.Name = trainer.Name.Substring(0, trainerTagStart);
        }
        if (!db.Trainers.Any(j => j.Id == trainer.Id))
        {
            db.Trainers.Add(trainer);
            db.SaveChanges();
        }

        // Age and weight are read from fixed child positions of the row.
        var age = item.ChildNodes[7].InnerHtml.Replace("\n", "").Trim();
        int ageTagStart = age.IndexOf("<");
        if (ageTagStart > 0)
        {
            age = age.Substring(0, ageTagStart);
        }
        runner.Age = Convert.ToInt32(age);

        var wt = item.ChildNodes[9].InnerHtml.Replace("\n", "").Trim();
        runner.WeightRaw = ProcessWt(wt);

        db.RPRunners.Add(runner);
    }
}
/// <summary>
/// Downloads one race result page, fills an RPRace from its header, lets the Process* helpers
/// derive further attributes and the runners, and saves the race. Does nothing when a race with
/// the id taken from the last segment of <paramref name="url"/> is already stored.
/// </summary>
/// <param name="url">Race result url; the last path segment is the race id and segment 4 (split on '/') the course id.</param>
/// <param name="db">Context the race and its runners are written to.</param>
public static void DownloadSingleRace(string url, RacingPostRacesEntities db)
{
    int raceId = Convert.ToInt32(url.Split('/').LastOrDefault());
    if (db.RPRaces.Any(r => r.Id == raceId))
    {
        return;
    }

    var web = new HtmlWeb();
    var doc = web.Load(url);

    // Extract header
    RPRace race = new RPRace();
    race.Id = raceId;
    race.StartTime = Convert.ToDateTime(doc.QuerySelectorAll("span .rp-raceTimeCourseName__date").FirstOrDefault().InnerHtml);
    race.Time = doc.QuerySelectorAll("span .rp-raceTimeCourseName__time").FirstOrDefault().InnerHtml;
    race.Name = doc.QuerySelectorAll("h2 .rp-raceTimeCourseName__title").FirstOrDefault().InnerHtml;
    race.CourseId = Convert.ToInt32(url.Split('/')[4]);
    race.PostTemplate = true;

    // Class and eligibility are optional on the page.
    var classNode = doc.QuerySelectorAll("span .rp-raceTimeCourseName_class").FirstOrDefault();
    if (classNode != null)
    {
        string classDesc = classNode.InnerHtml.Replace("(Class ", "").Replace(")", "").Replace("\n", "").Trim();
        race.Class = Convert.ToInt32(classDesc);
    }

    var agesNode = doc.QuerySelectorAll("span .rp-raceTimeCourseName_ratingBandAndAgesAllowed").FirstOrDefault();
    if (agesNode != null)
    {
        race.Eligibility = agesNode.InnerHtml.Replace("\n", "").Replace(")", "").Replace("(", "").Trim();
    }

    race.Distance = doc.QuerySelectorAll("span .rp-raceTimeCourseName_distance").FirstOrDefault().InnerHtml.Replace("\n", "").Trim();

    // The info container's span children form the raw class text, its div children the prize money.
    var infoContainer = doc.QuerySelectorAll("span .rp-raceTimeCourseName__info_container").FirstOrDefault();
    foreach (var child in infoContainer.ChildNodes.ToList())
    {
        if (child.Name.Equals("span"))
        {
            race.ClassRaw = race.ClassRaw + child.InnerHtml.Replace("\n", "").Trim() + " | ";
        }

        if (child.Name.Equals("div"))
        {
            foreach (var part in child.ChildNodes.ToList())
            {
                // NOTE(review): as written this replaces "£" with itself; possibly a garbled
                // entity replacement - confirm against the original source encoding.
                race.PrizeMoney = race.PrizeMoney + part.InnerHtml.Replace("\n", "").Trim().Replace("£", "£") + " ";
            }

            // PrizeMoney is still null when the div had no child nodes.
            if (race.PrizeMoney != null)
            {
                race.PrizeMoney = race.PrizeMoney.Trim();
            }
        }
    }

    race.Notes = doc.QuerySelectorAll("div .rp-raceInfo").FirstOrDefault().InnerHtml;

    ProcessRaceAttributes(race);
    ProcessFooter(race);
    ProcessRunner(race, doc, db);

    db.RPRaces.Add(race);
    db.SaveChanges();
}