/// <summary>
/// Searches for academicians of the Chinese Academy of Sciences: downloads the page at
/// <c>URL1</c>, extracts every <c>&lt;span&gt;&lt;a href="..."&gt;name&lt;/a&gt;</c> link and
/// builds an <see cref="Academincian"/> for each one that is not already known.
/// </summary>
/// <param name="acaold">
/// Keys of already-known academicians in the form <c>"中国科学院" + name</c>; links whose key
/// is in this set are skipped (no detail lookup is made for them).
/// </param>
/// <returns>
/// The newly found academicians. The list is empty when the download fails or nothing matches.
/// </returns>
private static List<Academincian> searcheraca1(HashSet<string> acaold)
{
    List<Academincian> acas = new List<Academincian>();

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL1);
    request.Method = "POST"; // the page is requested with POST and an empty body
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

    // Declared outside the try block so that whatever was read before a failure is still parsed.
    System.Text.StringBuilder html = new System.Text.StringBuilder();
    try
    {
        // Nothing is written to the body; the request stream is opened and closed right away
        // so it does not stay open while the response is read.
        request.GetRequestStream().Close();

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.UTF8))
        {
            // Lines are joined WITHOUT their terminators on purpose: '.' in the regex below
            // does not match '\n', so the page has to be flattened into a single line.
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                html.Append(line);
            }
        }
    }
    catch (WebException ex1)
    {
        Console.WriteLine(ex1);
    }
    catch (OutOfMemoryException ex2)
    {
        Console.WriteLine(ex2);
    }
    catch (IOException ex3)
    {
        Console.WriteLine(ex3);
    }

    // Group 2 captures everything between the opening quote of href and "</a>",
    // i.e. text of the shape:  url">name
    MatchCollection matches = Regex.Matches(html.ToString(), @"(<span><a href=\"")(.*?)(</a>)");
    foreach (Match match in matches)
    {
        string temp = match.Groups[2].Value;
        int quoteIndex = temp.IndexOf('"');
        int tagEndIndex = temp.IndexOf('>');
        if (quoteIndex < 0 || tagEndIndex < 0)
        {
            // Malformed anchor (no closing quote or no end of the opening tag): skip it
            // instead of throwing from Substring or producing a garbage name.
            continue;
        }

        string url = temp.Substring(0, quoteIndex);
        string name = temp.Substring(tagEndIndex + 1);
        if (acaold.Contains("中国科学院" + name))
        {
            continue;
        }

        // searcherdetail1 is expected to return at least three elements:
        // [0] image url, [1] detail1, [2] detail2.
        string[] result = searcherdetail1(url);

        Academincian aca = new Academincian();
        aca.name = name;
        aca.url = url;
        aca.flage = "中国科学院";
        aca.imageurl = result[0];
        aca.detail1 = result[1];
        aca.detail2 = result[2];
        acas.Add(aca);
        Console.WriteLine(name);
    }

    return acas;
}
/// <summary>
/// Searches for academicians of the Chinese Academy of Engineering: downloads the page at
/// <c>URL2</c>, extracts every <c>name_list ... &lt;/a&gt;&lt;/li&gt;</c> entry and builds an
/// <see cref="Academincian"/> for each distinct link that is not already known.
/// </summary>
/// <param name="acaold">
/// Keys of already-known academicians in the form <c>"中国工程院" + name</c>; entries whose key
/// is in this set are skipped (no detail lookup is made for them).
/// </param>
/// <returns>
/// The newly found academicians. The list is empty when the download fails or nothing matches.
/// </returns>
private static List<Academincian> searcheraca2(HashSet<string> acaold)
{
    List<Academincian> acas = new List<Academincian>();

    // Method and User-Agent are left at their defaults for this site.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL2);

    // Declared outside the try block so that whatever was read before a failure is still parsed.
    System.Text.StringBuilder html = new System.Text.StringBuilder();
    try
    {
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.UTF8))
        {
            // Lines are joined without their terminators, as the parsing below expects.
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                html.Append(line);
            }
        }
    }
    catch (WebException ex1)
    {
        Console.WriteLine(ex1);
    }
    catch (OutOfMemoryException ex2)
    {
        Console.WriteLine(ex2);
    }
    catch (IOException ex3)
    {
        Console.WriteLine(ex3);
    }

    MatchCollection matches = Regex.Matches(html.ToString(), @"name_list([\s\S]*?)(</a></li>)");
    HashSet<string> urls = new HashSet<string>(); // detail-page urls already handled in this run
    foreach (Match entry in matches)
    {
        string temp = entry.Value;

        Match hrefMatch = Regex.Match(temp, @"(href=)([\s\S]*?)( t)");
        if (!hrefMatch.Success)
        {
            // No link in this entry: without this check the bare site root would be
            // treated as a detail page and fetched.
            continue;
        }

        // Strip the surrounding  href="  and  " t  and point the link at the
        // "introduction" page instead of the "jump" page.
        string url = "http://www.cae.cn"
            + hrefMatch.Value.Replace("href=\"", "").Replace("\" t", "").Replace("jump", "introduction");
        if (!urls.Add(url))
        {
            continue; // same link seen earlier on the page
        }

        // The name is taken as the first run of CJK characters in the entry
        // (empty string when there is none).
        string name = Regex.Match(temp, @"([\u4E00-\u9FFF]+)").Value;
        if (acaold.Contains("中国工程院" + name))
        {
            continue;
        }

        // searcherdetail2 is expected to return at least three elements:
        // [0] image url, [1] detail1, [2] detail2.
        string[] result = searcherdetail2(url);

        Academincian academician = new Academincian();
        academician.name = name;
        academician.url = url;
        academician.flage = "中国工程院";
        academician.imageurl = result[0];
        academician.detail1 = result[1];
        academician.detail2 = result[2];
        Console.WriteLine(name);
        acas.Add(academician);
    }

    return acas;
}