/// <summary>
/// Extracts every horse found on a pedigree page.
/// </summary>
/// <param name="source">Page text passed to both extraction helpers.</param>
/// <returns>
/// A new list whose first element is the result of <c>ExtractQueryHorse</c>, followed by
/// whatever <c>ExtractPedigreeHorses</c> appends.
/// </returns>
public static List<PedigreeHorse> Extract(string source)
{
    // The queried horse always occupies index 0; the ancestors follow it.
    var result = new List<PedigreeHorse> { ExtractQueryHorse(source) };
    ExtractPedigreeHorses(source, result);
    return result;
}
/// <summary>
/// Extracts the horses on a pedigree page and merges them into <c>db_ppdb.PQ_Horses</c>.
/// Horses whose id is not found are queued with <c>InsertOnSubmit</c>; for horses that are
/// found, only fields that are still null are filled in. Race-record fields (starts, wins,
/// places, earnings, owner, breeder) are always overwritten for the generation-0 horse.
/// One log line is written per processed horse.
/// </summary>
/// <param name="db_ppdb">
/// Data context that is queried and updated. This method does not call SubmitChanges itself.
/// </param>
/// <param name="page">Page text handed to <c>PedigreeExtract.Extract</c>.</param>
/// <param name="outcome">Not read or written by this method.</param>
/// <param name="pqid">Not read or written by this method.</param>
private static void ScrapePage(RacingPostRacesDataContext db_ppdb, string page, ref string outcome, ref string pqid)
{
    List<PedigreeHorse> horses = PedigreeExtract.Extract(page);

    // Ids queued for insert during this call. Later occurrences of the same id are skipped.
    // NOTE(review): presumably needed because a pending insert is not returned by the
    // FirstOrDefault query below until changes are submitted - confirm.
    // A HashSet gives O(1) membership checks and, like the List it replaces, tolerates null.
    var added = new HashSet<string>();

    foreach (PedigreeHorse horse in horses.OrderBy(x => x.Generation).ThenByDescending(x => x.Pedigree))
    {
        if (added.Contains(horse.PQId))
        {
            continue;
        }

        PQ_Horse pq_horse = db_ppdb.PQ_Horses.FirstOrDefault(x => x.Id == horse.PQId);
        if (pq_horse == null)
        {
            pq_horse = new PQ_Horse();
            db_ppdb.PQ_Horses.InsertOnSubmit(pq_horse);
            pq_horse.Id = horse.PQId;
            pq_horse.Name = horse.Name;
            pq_horse.FlatName = FlattenName(horse.Name, false);
            added.Add(horse.PQId);
        }

        // Only fill gaps: values already stored on the row win over scraped ones.
        if (pq_horse.Country == null)
        {
            pq_horse.Country = horse.Country;
        }
        if (pq_horse.FoalYear == null)
        {
            pq_horse.FoalYear = horse.FoalYear;
        }
        if (pq_horse.Colour == null)
        {
            pq_horse.Colour = horse.Colour;
        }
        if (pq_horse.Sex == null)
        {
            pq_horse.Sex = horse.Sex;
        }

        // A parent's Pedigree path is the child's path with "S" (sire) or "D" (dam) appended.
        if (pq_horse.SireId == null)
        {
            string sire_path = horse.Pedigree + "S";
            PedigreeHorse sire1 = horses.FirstOrDefault(x => x.Pedigree == sire_path);
            if (sire1 != null)
            {
                pq_horse.SireId = sire1.PQId;
            }
        }
        if (pq_horse.DamId == null)
        {
            string dam_path = horse.Pedigree + "D";
            PedigreeHorse dam1 = horses.FirstOrDefault(x => x.Pedigree == dam_path);
            if (dam1 != null)
            {
                pq_horse.DamId = dam1.PQId;
            }
        }

        // Race-record fields are refreshed unconditionally, but only for the generation-0 horse.
        if (horse.Generation == 0)
        {
            pq_horse.Starts = horse.Starts;
            pq_horse.Wins = horse.Wins;
            pq_horse.Places = horse.Places;
            pq_horse.Earnings = horse.Earnings;
            pq_horse.Owner = horse.Owner;
            pq_horse.Breeder = horse.Breeder;
        }

        Logger.WriteLog(horse.Pedigree + " " + horse.PQId + " " + horse.Name + " " + horse.Country + " " +
                        horse.FoalYear + " " + horse.Colour + " " + horse.Starts + " " + horse.Wins + " " +
                        horse.Places + " " + horse.Earnings + " " + horse.Owner + " " + horse.Breeder);
    }
}
/// <summary>
/// Walks the rows and cells of the pedigree table in <paramref name="source"/> and appends a
/// <see cref="PedigreeHorse"/> to <paramref name="horses"/> for every cell (or cell pair) that
/// yields a horse. Nothing is appended when no pedigree table is found.
/// </summary>
/// <param name="source">Page text to search for the <c>class=pedigreetable</c> table.</param>
/// <param name="horses">List that receives the extracted horses; it is only appended to.</param>
private static void ExtractPedigreeHorses(string source, List<PedigreeHorse> horses)
{
    // NOTE(review): both ".*" groups are greedy under Singleline, so the capture runs from the
    // last "class=pedigreetable" to the last "</table>" in the page - confirm that is intended.
    var regex_table = new Regex("<table.*class=pedigreetable(.*)</table>", RegexOptions.Singleline);
    Match match_table = regex_table.Match(source);
    if (!match_table.Success)
    {
        return;
    }

    // Built once up front instead of once per row / per cell.
    var regex_row = new Regex("<tr>(.*?)</tr", RegexOptions.Singleline);
    var regex_col = new Regex("<td([^>]*?)>(.*?)</td", RegexOptions.Singleline);
    // No trailing-whitespace requirement and an optional quote, so a rowspan that is the last
    // attribute (rowspan=2>) or quoted (rowspan="2") is still recognised.
    var regex_rowspan = new Regex(@"rowspan=[""']?(\d+)", RegexOptions.Singleline);

    string table = match_table.Groups[1].ToString();
    PedigreeHorse pending = null; // horse started by a link cell, waiting for its year/colour cell
    int row_num = 0;

    for (Match match_row = regex_row.Match(table); match_row.Success; match_row = match_row.NextMatch())
    {
        row_num++;
        string row = match_row.Groups[1].ToString();

        // Sample cell:
        // <td colspan=2 rowspan=16 class=m onmousedown="clickMenu('FOOTSTEPSINTHESAND',5,1,event);"><a href=/footstepsinthes
        for (Match match_col = regex_col.Match(row); match_col.Success; match_col = match_col.NextMatch())
        {
            string col_parms = match_col.Groups[1].ToString();
            string col = match_col.Groups[2].ToString();

            // Fresh scratch values for every cell; the Extract* helpers fill them through ref.
            string pqid = "";
            string name = "";
            string remainder = "";
            string country = "";
            int year = 0;
            string colour = "";

            // HTML's default row span is 1. It is also the fallback when the attribute value
            // cannot be parsed, so a value left over from an earlier cell is never reused.
            int rowspan = 1;

            if (col_parms.Contains("rowspan"))
            {
                // Spanning cell: the whole horse description lives in this one cell.
                Match match_rowspan = regex_rowspan.Match(col_parms);
                if (match_rowspan.Success)
                {
                    rowspan = Convert.ToInt32(match_rowspan.Groups[1].ToString());
                }

                var horse = new PedigreeHorse();
                if (ExtractHorse(col, ref pqid, ref name, ref remainder))
                {
                    horse.PQId = pqid;
                    horse.Name = name;
                    if (ExtractCountry(remainder, ref country))
                    {
                        horse.Country = country;
                    }
                    if (ExtractFoalYear(remainder, ref year))
                    {
                        horse.FoalYear = year;
                    }
                    if (ExtractColour(remainder, ref colour))
                    {
                        horse.Colour = colour;
                    }
                    horse.Generation = GetGeneration(rowspan);
                    horse.Pedigree = GetPedigree(row_num, rowspan);
                    horses.Add(horse);
                }
            }
            else if (col.Contains("<a href"))
            {
                // Non-spanning link cell: starts a horse (id, name, country) that is completed
                // by a later class=m / class=f cell.
                pending = new PedigreeHorse();
                if (ExtractHorse(col, ref pqid, ref name, ref remainder))
                {
                    pending.PQId = pqid;
                    pending.Name = name;
                    if (ExtractCountry(remainder, ref country))
                    {
                        pending.Country = country;
                    }
                }
            }
            else if (col_parms.Contains("class=m") || col_parms.Contains("class=f"))
            {
                // Detail cell: supplies foal year and colour for the pending horse, if that
                // horse was successfully identified.
                if (pending != null && pending.PQId != null)
                {
                    if (ExtractFoalYear(col, ref year))
                    {
                        pending.FoalYear = year;
                    }
                    if (ExtractColour(col, ref colour))
                    {
                        pending.Colour = colour;
                    }
                    pending.Generation = GetGeneration(rowspan);
                    pending.Pedigree = GetPedigree(row_num, rowspan);
                    horses.Add(pending);
                    pending = null;
                }
            }
        }
    }
}
/// <summary>
/// Builds the <see cref="PedigreeHorse"/> for the horse the page is about. Each field is
/// set only when its pattern (or helper) succeeds; Generation is always 0 and Pedigree is
/// always the empty string.
/// </summary>
/// <param name="source">Page text to scan.</param>
/// <returns>A new horse object, returned even when nothing could be extracted.</returns>
private static PedigreeHorse ExtractQueryHorse(string source)
{
    const RegexOptions opts = RegexOptions.Singleline;
    var subject = new PedigreeHorse();

    // Id: the path segment of the "Pedigree" menu link.
    Match idMatch = Regex.Match(source, @"<li><a href=""/(\S+)""[^>]*>Pedigree</a>", opts);
    if (idMatch.Success)
    {
        subject.PQId = idMatch.Groups[1].Value.Trim();
    }

    // Headline block: name, description and race record are all read from it.
    Match headlineMatch = Regex.Match(source, @"<font size='-1' class=normal>(.*?)</font>", opts);
    string headline = headlineMatch.Success ? headlineMatch.Groups[1].Value.Trim() : "";

    Match nameMatch = Regex.Match(
        headline,
        @"<a href=""javascript:nothing\(\);"" class=""nounderline""[^>]*>([^<]*)</a></b>",
        opts);
    if (nameMatch.Success)
    {
        subject.Name = nameMatch.Groups[1].Value.Trim();
    }

    // Description: the text between an </a> and the "DP =" marker that follows it. The
    // leading greedy ".*" makes the engine prefer the last such </a>.
    // Earlier pattern, kept for reference: @"</a>(\s*\([A-Z]+\).*)DP ="
    Match descriptionMatch = Regex.Match(headline, @".*</a>(.*?)DP =", opts);
    string description = descriptionMatch.Success ? descriptionMatch.Groups[1].Value.Trim() : "";

    string country = "";
    if (ExtractCountry(description, ref country))
    {
        subject.Country = country;
    }

    string colour = "";
    if (ExtractColour(description, ref colour))
    {
        subject.Colour = colour;
    }

    string sex = "";
    if (ExtractSex(description, ref sex))
    {
        subject.Sex = sex;
    }

    int year = 0;
    if (ExtractFoalYear(description, ref year))
    {
        subject.FoalYear = year;
    }

    Match recordMatch = Regex.Match(headline, @"(\d+) Starts, (\d+|M) Wins, (\d+) Places", opts);
    if (recordMatch.Success)
    {
        subject.Starts = Convert.ToInt32(recordMatch.Groups[1].Value);

        // A literal "M" in the wins slot is stored as zero wins.
        string winsText = recordMatch.Groups[2].Value;
        subject.Wins = winsText == "M" ? 0 : Convert.ToInt32(winsText);

        subject.Places = Convert.ToInt32(recordMatch.Groups[3].Value);
    }

    // Everything after the label, up to the end of the headline block.
    Match earningsMatch = Regex.Match(headline, @"Career Earnings:</b>(.*)$", opts);
    if (earningsMatch.Success)
    {
        subject.Earnings = earningsMatch.Groups[1].Value.Trim();
    }

    // Owner and breeder are read from the separate "subjectinfo" div, when present.
    Match infoMatch = Regex.Match(source, @"<div id=""subjectinfo""(.*?)</div>", opts);
    if (infoMatch.Success)
    {
        string details = infoMatch.Groups[1].Value.Trim();

        Match ownerMatch = Regex.Match(details, @"Owner</b>:([^<]*)<", opts);
        if (ownerMatch.Success)
        {
            subject.Owner = ownerMatch.Groups[1].Value.Trim();
        }

        Match breederMatch = Regex.Match(details, @"Breeder</b>:([^<]*)<", opts);
        if (breederMatch.Success)
        {
            subject.Breeder = breederMatch.Groups[1].Value.Trim();
        }
    }

    subject.Generation = 0;
    subject.Pedigree = "";
    return subject;
}