Esempio n. 1
0
        /// <summary>
        /// Downloads the page at <paramref name="relativeLink"/> (relative to the site root), scans it with
        /// <c>NamePattern</c>, logs every entry to the console and writes one comma-separated line per
        /// well-formed entry to <paramref name="writer"/>.
        /// </summary>
        /// <param name="writer">Receives one "first,second,name,count" line per entry that parses.</param>
        /// <param name="firstSecondName">
        /// Text whose first two characters are used as the log prefix and as the first two columns of every
        /// written line. It is indexed at 0 and 1 without a length check, so it must hold at least two characters.
        /// </param>
        /// <param name="relativeLink">Appended verbatim to the site root to form the URL that is fetched.</param>
        private static void ExtractNames(StreamWriter writer, string firstSecondName, string relativeLink)
        {
            char first  = firstSecondName[0];
            char second = firstSecondName[1];
            string tag  = string.Format("[{0}][{1}]", first, second);

            // Visual separator between pages in the console log.
            Console.WriteLine("=======================================================================");

            string pageUrl = "http://renlifang.msra.cn/" + relativeLink;
            Console.WriteLine("{0} Loading names from {1}...", tag, pageUrl);

            var pageHtml = new WebRobot().Get(pageUrl);

            const RegexOptions listOptions = RegexOptions.Compiled | RegexOptions.IgnoreCase;
            foreach (Match entry in Regex.Matches(pageHtml, NamePattern, listOptions))
            {
                // Each entry is expected to look like "name(count)"; anything else is reported and skipped.
                string rawText = entry.Groups["Name"].Value;
                Match  parsed  = Regex.Match(rawText, @"^(?<Name>[^(]+)\((?<Count>.+)\)$", RegexOptions.Compiled);
                if (!parsed.Success)
                {
                    Console.WriteLine("{0} Invalid name: {1}", tag, rawText);
                    continue;
                }

                string personName  = parsed.Groups["Name"].Value;
                string occurrences = parsed.Groups["Count"].Value;
                Console.WriteLine("{0} Name: {1}, Count: {2}", tag, personName, occurrences);

                string csvLine = string.Format("{0},{1},{2},{3}", first, second, personName, occurrences);
                writer.WriteLine(csvLine);
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Downloads the name-list page for <paramref name="firstNameChar"/>, scans it with <c>NamePattern</c>
        /// and calls <c>ExtractNames</c> for every matched link whose text is exactly two characters long.
        /// Entries of any other length are reported on the console and skipped.
        /// The writer is flushed once after all entries have been processed.
        /// </summary>
        /// <param name="writer">Output passed through to <c>ExtractNames</c> and flushed at the end.</param>
        /// <param name="firstNameChar">Character that is URL-encoded into the <c>f</c> query parameter.</param>
        private static void CrawlNames(StreamWriter writer, char firstNameChar)
        {
            WebRobot robot         = new WebRobot();
            var      secondNameUrl = "http://renlifang.msra.cn/namelist.aspx?f=" + HttpUtility.UrlEncode(firstNameChar.ToString());

            Console.WriteLine("[{0}] Loading from {1}...", firstNameChar, secondNameUrl);
            var response = robot.Get(secondNameUrl);
            // Example of the markup being matched:
            // <div class="name-list"><a href="namelist.aspx?f=%e9%98%bf&s=%e9%98%bf">阿阿</a></div>
            var matches = Regex.Matches(response, NamePattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);

            foreach (Match match in matches)
            {
                var firstSecondName = match.Groups["Name"].Value;
                var relativeLink    = match.Groups["Link"].Value;
                if (firstSecondName.Length != 2)
                {
                    // The format string has two placeholders, so it needs two arguments; supplying only one
                    // makes Console.WriteLine throw FormatException instead of logging and skipping the entry.
                    Console.WriteLine("[{0}] Invalid FirstSecondName: {1}", firstNameChar, firstSecondName);
                    continue;
                }

                ExtractNames(writer, firstSecondName, relativeLink);
            }

            writer.Flush();
        }