/// <summary>
/// Downloads the boxscore page for a single game, records the game row, and
/// passes the away and home score tables to <c>ParseScoreTable</c>. When either
/// team code or either score table cannot be found, the game id is written to
/// the error log instead.
/// </summary>
/// <param name="gid">Game id; its first 8 characters are used as the game date.</param>
private static void CrawlGame(string gid)
{
    // ToString() on a string yields the same value; the call is retained so a
    // null gid fails at this exact point.
    string id = gid.ToString();

    // Ids that are numerically greater than 201103100000 get the extra "old_bs" query flag.
    string legacySuffix = long.Parse(id) > 201103100000 ? "&old_bs=1" : string.Empty;
    Uri pageUri = new Uri(@"http://rivals.yahoo.com/ncaa/basketball/boxscore?gid=" + id + legacySuffix);

    string pageHtml = DownloadAndCache(pageUri);
    HtmlDocument page = new HtmlDocument();
    page.LoadHtml(pageHtml);

    // The leading 8 characters of the game id are the date.
    string gameDate = gid.Substring(0, 8);

    HtmlNode root = page.DocumentNode;

    // Cells inside the summary table (table[3]) that carry each team's link:
    // row 4 is the away (top) team, row 6 is the home (bottom) team.
    HtmlNode awayLinkCell = root.SelectSingleNode(@"/html[1]/head[1]/body[1]/div[1]/table[1]/tr[1]/td[1]/table[2]/tr[1]/td[1]/div[1]/table[3]/tr[1]/td[2]/table[1]/tr[1]/td[1]/table[1]/tr[1]/td[1]/table[1]/tr[4]/td[2]");
    HtmlNode homeLinkCell = root.SelectSingleNode(@"/html[1]/head[1]/body[1]/div[1]/table[1]/tr[1]/td[1]/table[2]/tr[1]/td[1]/div[1]/table[3]/tr[1]/td[2]/table[1]/tr[1]/td[1]/table[1]/tr[1]/td[1]/table[1]/tr[6]/td[2]");

    string awayCode = GetTeamCodeFromNode(awayLinkCell);
    string homeCode = GetTeamCodeFromNode(homeLinkCell);

    // Score tables: table[5] is the away team, table[7] is the home team.
    HtmlNode awayScores = root.SelectSingleNode(@"/html[1]/head[1]/body[1]/div[1]/table[1]/tr[1]/td[1]/table[2]/tr[1]/td[1]/div[1]/table[5]");
    HtmlNode homeScores = root.SelectSingleNode(@"/html[1]/head[1]/body[1]/div[1]/table[1]/tr[1]/td[1]/table[2]/tr[1]/td[1]/div[1]/table[7]");

    // Bail out (and log) unless both team codes and both tables were found.
    if (string.IsNullOrEmpty(awayCode) || string.IsNullOrEmpty(homeCode) || awayScores == null || homeScores == null)
    {
        ErrorLog.WriteLine("Could not parse " + gid);
        ErrorLog.Flush();
        return;
    }

    // Record the game itself before its per-team score tables.
    GameRow game = new GameRow
    {
        date = gameDate,
        gid = gid,
        awayTeam = awayCode,
        homeTeam = homeCode
    };
    WriteGameRow(game);

    // Away (top) table first, then home (bottom) table.
    ParseScoreTable(awayScores, gid, awayCode);
    ParseScoreTable(homeScores, gid, homeCode);
}
/// <summary>
/// Appends one game as a tab-separated line to "Games.tsv" under
/// <c>OUTPUT_PATH</c>, creating the shared writer on first use.
/// </summary>
/// <param name="row">Game whose gid, date, home team and away team are written, in that order.</param>
private static void WriteGameRow(GameRow row)
{
    // The writer is created lazily, the first time a row is written.
    if (GameStream == null)
    {
        string gamesFile = Path.Combine(OUTPUT_PATH, "Games.tsv");
        GameStream = new StreamWriter(gamesFile);
    }

    // Column order: gid, date, home team, away team.
    const string RowFormat = "{0}\t{1}\t{2}\t{3}";
    GameStream.WriteLine(RowFormat, row.gid, row.date, row.homeTeam, row.awayTeam);
}