/// <summary>
/// Downloads the page at "http://renlifang.msra.cn/" + <paramref name="relativeLink"/>,
/// scans it with <c>NamePattern</c>, and writes one CSV row per entry whose text has the
/// form "name(count)".
/// </summary>
/// <param name="writer">
/// Destination for the rows "first,second,name,count". This method does not flush it.
/// </param>
/// <param name="firstSecondName">
/// The name prefix; its characters at index 0 and 1 are used for logging and as the
/// first two CSV columns.
/// </param>
/// <param name="relativeLink">Link appended to the site root to build the page URL.</param>
private static void ExtractNames(StreamWriter writer, string firstSecondName, string relativeLink)
{
    // Entries are expected to look like "name(count)".
    const string CountedNamePattern = @"^(?<Name>[^(]+)\((?<Count>.+)\)$";

    char firstChar = firstSecondName[0];
    char secondChar = firstSecondName[1];
    string tag = string.Format("[{0}][{1}]", firstChar, secondChar);

    // The separator has no placeholders, so the extra argument does not affect the output.
    Console.WriteLine("=======================================================================", tag);

    string pageUrl = "http://renlifang.msra.cn/" + relativeLink;
    Console.WriteLine("{0} Loading names from {1}...", tag, pageUrl);

    var client = new WebRobot();
    string html = client.Get(pageUrl);

    MatchCollection entries = Regex.Matches(html, NamePattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);
    foreach (Match entry in entries)
    {
        string rawText = entry.Groups["Name"].Value;
        Match parsed = Regex.Match(rawText, CountedNamePattern, RegexOptions.Compiled);

        if (!parsed.Success)
        {
            Console.WriteLine("{0} Invalid name: {1}", tag, rawText);
            continue;
        }

        string personName = parsed.Groups["Name"].Value;
        string occurrences = parsed.Groups["Count"].Value;

        Console.WriteLine("{0} Name: {1}, Count: {2}", tag, personName, occurrences);
        writer.WriteLine("{0},{1},{2},{3}", firstChar, secondChar, personName, occurrences);
    }
}
/// <summary>
/// Loads the name-list page for <paramref name="firstNameChar"/>, and for every
/// two-character entry found by <c>NamePattern</c> calls <c>ExtractNames</c> with the
/// entry text and its link. Entries whose text is not exactly two characters long are
/// logged and skipped. The writer is flushed once all entries have been processed.
/// </summary>
/// <param name="writer">Output passed through to <c>ExtractNames</c>; flushed at the end.</param>
/// <param name="firstNameChar">
/// Character that is URL-encoded into the "f" query parameter of namelist.aspx.
/// </param>
private static void CrawlNames(StreamWriter writer, char firstNameChar)
{
    WebRobot robot = new WebRobot();
    var secondNameUrl = "http://renlifang.msra.cn/namelist.aspx?f="
        + HttpUtility.UrlEncode(firstNameChar.ToString());

    Console.WriteLine("[{0}] Loading from {1}...", firstNameChar, secondNameUrl);
    var response = robot.Get(secondNameUrl);

    // Example of the markup being matched:
    // <div class="name-list"><a href="namelist.aspx?f=%e9%98%bf&s=%e9%98%bf">阿阿</a></div>
    var matches = Regex.Matches(response, NamePattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);
    foreach (Match match in matches)
    {
        var firstSecondName = match.Groups["Name"].Value;
        var relativeLink = match.Groups["Link"].Value;

        if (firstSecondName.Length != 2)
        {
            // Each placeholder needs its own argument; supplying fewer arguments than
            // placeholders makes Console.WriteLine throw a FormatException.
            Console.WriteLine("[{0}] Invalid FirstSecondName: {1}", firstNameChar, firstSecondName);
            continue;
        }

        ExtractNames(writer, firstSecondName, relativeLink);
    }

    writer.Flush();
}