示例#1
0
        /// <summary>
        /// Searches for members of the Chinese Academy of Sciences.
        /// Downloads the page at <c>URL1</c>, extracts every
        /// <c>&lt;span&gt;&lt;a href="..."&gt;name&lt;/a&gt;</c> link from it and builds an
        /// entry for each name that is not already known.
        /// </summary>
        /// <param name="acaold">
        /// Keys of already-known entries, in the form "中国科学院" + name.
        /// Names whose key is present in this set are skipped.
        /// </param>
        /// <returns>
        /// The newly found entries. The list is empty when the download fails
        /// (the exception is written to the console) or when nothing new is found.
        /// </returns>
        private static List <Academincian> searcheraca1(HashSet <string> acaold)
        {
            List <Academincian> acas = new List <Academincian>();

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL1);
            request.Method    = "POST"; // the listing page is requested with POST
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

            // Declared outside the try block so that whatever was read before a
            // failure is still scanned below, as in the original implementation.
            System.Text.StringBuilder html = new System.Text.StringBuilder();

            try
            {
                // The POST carries no body, but the request stream must still be
                // closed before the response is requested so the connection is released.
                using (Stream requestStream = request.GetRequestStream())
                {
                }

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.UTF8))
                {
                    // ReadLine strips line terminators, so the page is collapsed
                    // onto a single line before the regex below is applied.
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        html.Append(line);
                    }
                }
            }
            catch (WebException ex1)
            {
                Console.WriteLine(ex1);
            }
            catch (OutOfMemoryException ex2)
            {
                Console.WriteLine(ex2);
            }
            catch (IOException ex3)
            {
                Console.WriteLine(ex3);
            }

            MatchCollection matches = Regex.Matches(html.ToString(), @"(<span><a href=\"")(.*?)(</a>)");

            foreach (Match match in matches)
            {
                // Group 2 holds everything between 'href="' and '</a>':
                // the URL, the closing quote, any further attributes, '>' and the name.
                string temp = match.Groups[2].Value;

                int quoteIndex = temp.IndexOf('"');
                if (quoteIndex < 0)
                {
                    // Malformed anchor without a closing quote: skip it instead of
                    // letting Substring throw and abort the whole scan.
                    continue;
                }

                string url  = temp.Substring(0, quoteIndex);
                string name = temp.Substring(temp.IndexOf('>') + 1);

                if (acaold.Contains("中国科学院" + name))
                {
                    continue;
                }

                Academincian aca = new Academincian();
                aca.name  = name;
                aca.url   = url;
                aca.flage = "中国科学院";

                // searcherdetail1 is defined elsewhere; its first three elements
                // are stored as image URL, detail1 and detail2 respectively.
                string[] result = searcherdetail1(url);
                aca.imageurl = result[0];
                aca.detail1  = result[1];
                aca.detail2  = result[2];

                acas.Add(aca);
                Console.WriteLine(name);
            }

            return(acas);
        }
示例#2
0
        /// <summary>
        /// Searches for members of the Chinese Academy of Engineering.
        /// Downloads the page at <c>URL2</c>, extracts each "name_list" entry ending in
        /// <c>&lt;/a&gt;&lt;/li&gt;</c>, derives the entry's detail URL and name, and builds
        /// an entry for each name that is not already known.
        /// </summary>
        /// <param name="acaold">
        /// Keys of already-known entries, in the form "中国工程院" + name.
        /// Names whose key is present in this set are skipped.
        /// </param>
        /// <returns>
        /// The newly found entries. The list is empty when the download fails
        /// (the exception is written to the console) or when nothing new is found.
        /// </returns>
        private static List <Academincian> searcheraca2(HashSet <string> acaold)
        {
            List <Academincian> acas = new List <Academincian>();

            // Default method (GET) and default headers are used for this request.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL2);

            // Declared outside the try block so that whatever was read before a
            // failure is still scanned below, as in the original implementation.
            System.Text.StringBuilder html = new System.Text.StringBuilder();

            try
            {
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.UTF8))
                {
                    // ReadLine strips line terminators; lines are joined without separators.
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        html.Append(line);
                    }
                }
            }
            catch (WebException ex1)
            {
                Console.WriteLine(ex1);
            }
            catch (OutOfMemoryException ex2)
            {
                Console.WriteLine(ex2);
            }
            catch (IOException ex3)
            {
                Console.WriteLine(ex3);
            }

            MatchCollection  matches = Regex.Matches(html.ToString(), @"name_list([\s\S]*?)(</a></li>)");
            HashSet <String> urls    = new HashSet <string>(); // detail URLs already handled in this run

            foreach (Match entry in matches)
            {
                string temp = entry.Value;

                Match hrefMatch = Regex.Match(temp, @"(href=)([\s\S]*?)( t)");
                if (!hrefMatch.Success)
                {
                    // No link in this entry: without it the URL would degrade to
                    // the bare site root, so the entry is skipped.
                    continue;
                }

                // Strip the surrounding 'href="' / '" t' and point the link at the
                // "introduction" page instead of the "jump" page.
                string url = "http://www.cae.cn" + hrefMatch.Value.Replace("href=\"", "").Replace("\" t", "").Replace("jump", "introduction");
                if (!urls.Add(url))
                {
                    continue; // duplicate link
                }

                // The name is taken as the first run of CJK ideographs in the entry.
                String name = Regex.Match(temp, @"([\u4E00-\u9FFF]+)").Value;
                if (acaold.Contains("中国工程院" + name))
                {
                    continue;
                }

                Academincian academician = new Academincian();
                academician.name  = name;
                academician.url   = url;
                academician.flage = "中国工程院";

                // searcherdetail2 is defined elsewhere; its first three elements
                // are stored as image URL, detail1 and detail2 respectively.
                String[] result = searcherdetail2(url);
                academician.imageurl = result[0];
                academician.detail1  = result[1];
                academician.detail2  = result[2];

                Console.WriteLine(name);
                acas.Add(academician);
            }

            return(acas);
        }