// Commented-out stub retained from the original file:
//public List<string> GetAllMailbox(List<DeepWebPage> deepWebPages)
//{
//    List<string> Mailboxes = new List<string>();
//    foreach (var dwp in deepWebPages)
//    {
//    }
//    return Mailboxes;
//}

/// <summary>
/// Downloads the page at <c>deepWebPage.webUrl</c> and scrapes a single
/// "name&lt;TAB&gt;email&lt;LF&gt;" record out of its HTML.
/// </summary>
/// <param name="deepWebPage">Page descriptor; only its <c>webUrl</c> is read here.</param>
/// <returns>
/// A list holding one formatted record, or an empty list when the expected
/// HTML markers are not present in the downloaded page.
/// </returns>
/// <remarks>
/// For now a page is assumed to contain exactly one e-mail address.
/// Download failures raised by <see cref="WebClient.DownloadData(string)"/> propagate to the caller.
/// </remarks>
public List<string> GetSingleMailbox(DeepWebPage deepWebPage)
{
    // Marker that immediately precedes the e-mail text.
    const string emailKey = "<li>电子邮箱:<span><strong>";
    // Marker that precedes the user name.
    const string nameKey = "user-name";
    // Characters skipped between nameKey and the name text (the original used a
    // fixed offset of 11 = 9 + 2; presumably the closing quote and '>' -- TODO confirm).
    const int nameValueSkip = 2;

    List<string> smailboxes = new List<string>();

    byte[] pageData;
    // WebClient is IDisposable; release it as soon as the download finishes.
    using (WebClient client = new WebClient())
    {
        // Network credentials used to authenticate the request.
        client.Credentials = CredentialCache.DefaultCredentials;
        pageData = client.DownloadData(deepWebPage.webUrl);
    }

    // Decode as UTF-8 when the bytes look like UTF-8; otherwise fall back to
    // Encoding.Default (the original comment says this targets GB2312 pages).
    string htmlCode = IsUtf8(pageData)
        ? Encoding.UTF8.GetString(pageData)
        : Encoding.Default.GetString(pageData);

    int emailPos = htmlCode.IndexOf(emailKey, StringComparison.Ordinal);
    int namePos = htmlCode.IndexOf(nameKey, StringComparison.Ordinal);
    if (emailPos < 0 || namePos < 0)
    {
        // Page does not have the expected layout; nothing to extract.
        return smailboxes;
    }

    int emailStart = emailPos + emailKey.Length;
    int nameStart = namePos + nameKey.Length + nameValueSkip;
    if (nameStart > htmlCode.Length)
    {
        // The name marker sits at the very end of the page; no value follows it.
        return smailboxes;
    }

    // Search from the value start instead of copying the tail of the page.
    int emailEnd = htmlCode.IndexOf("</strong>", emailStart, StringComparison.Ordinal);
    int nameEnd = htmlCode.IndexOf("</h5>", nameStart, StringComparison.Ordinal);
    if (emailEnd < 0 || nameEnd < 0)
    {
        return smailboxes;
    }

    string name = htmlCode.Substring(nameStart, nameEnd - nameStart);
    string email = htmlCode.Substring(emailStart, emailEnd - emailStart);
    smailboxes.Add(name + "\t" + email + "\n");

    return smailboxes;
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and collects every link that
/// starts with the hard-coded desktop prefix into a list of <c>DeepWebPage</c> objects.
/// </summary>
/// <param name="url">Address of the page whose child links are harvested.</param>
/// <returns>
/// One <c>DeepWebPage</c> per occurrence of the prefix, in page order, with
/// <c>webUrl</c> set to the fixed-length slice starting at that occurrence.
/// The list is empty when the prefix never occurs.
/// </returns>
/// <remarks>
/// Download failures raised by <see cref="WebClient.DownloadData(string)"/> propagate to the caller.
/// </remarks>
public List<DeepWebPage> getSonPage(String url)
{
    // Prefix that marks the start of a child-page URL.
    const string linkPrefix = "http://pdc.hzau.edu.cn/jgfw/desktop/";
    // Number of characters taken for each URL (fixed by the original code; adjust if the site changes).
    const int linkLength = 66;

    List<DeepWebPage> deepWebPages = new List<DeepWebPage>();

    byte[] pageData;
    // WebClient is IDisposable; release it as soon as the download finishes.
    using (WebClient client = new WebClient())
    {
        // Network credentials used to authenticate the request.
        client.Credentials = CredentialCache.DefaultCredentials;
        pageData = client.DownloadData(url);
    }

    // Decode as UTF-8 when the bytes look like UTF-8; otherwise fall back to
    // Encoding.Default (the original comment says this targets GB2312 pages).
    string htmlCode = IsUtf8(pageData)
        ? Encoding.UTF8.GetString(pageData)
        : Encoding.Default.GetString(pageData);

    int pos = htmlCode.IndexOf(linkPrefix, StringComparison.Ordinal);
    while (pos >= 0)
    {
        // Clamp the slice so a prefix found near the end of the page cannot
        // push Substring past the end of the string.
        int length = Math.Min(linkLength, htmlCode.Length - pos);
        deepWebPages.Add(new DeepWebPage { webUrl = htmlCode.Substring(pos, length) });

        // Resume the search just after the prefix that was matched.
        int next = pos + linkPrefix.Length;
        if (next >= htmlCode.Length)
        {
            break;
        }
        pos = htmlCode.IndexOf(linkPrefix, next, StringComparison.Ordinal);
    }

    return deepWebPages;
}