/// <summary>
/// Entry point. Reads the output directory from PathSetting.txt and the URL list from WIKIFX.csv
/// (both looked up under the application's base directory), then passes each URL to
/// <c>WebHandler.GetFilterHtml</c>. A failure on one URL is printed and the next URL is processed.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
    string str = baseDirectory + @"\WIKIFX.csv";
    string settingPath = baseDirectory + @"\PathSetting.txt";

    // Dispose the reader deterministically instead of leaving the file handle to the finalizer.
    string htmlPath;
    using (StreamReader fileStream = new StreamReader(settingPath, Encoding.Default))
    {
        // Trim so that a trailing newline in the settings file does not end up inside every output path.
        htmlPath = fileStream.ReadToEnd().Trim();
    }

    var data = WebHandler.readCsvTxt(str);
    foreach (DataRow dr in data.Rows)
    {
        // The first column of each row is used as the page URL.
        string url = dr[0].ToString();
        try
        {
            // The fifth '/'-separated segment (e.g. "http://host/dir/page.html" -> "page.html")
            // names the per-page output folder; rows without it cannot be processed.
            string[] arry = url.Split('/');
            if (arry.Length <= 4)
            {
                Console.WriteLine("Skipping malformed URL: " + url);
                continue;
            }

            string path = arry[4].Replace(".html", "");
            WebHandler.GetFilterHtml(url, htmlPath + path, htmlPath);
        }
        catch (Exception e)
        {
            // Best effort: report the failure and carry on with the remaining URLs.
            Console.WriteLine(e.Message);
        }
    }
}
/// <summary>
/// Downloads the parent page at <paramref name="url"/> and saves it, then for every article link
/// returned by <c>htmlFather</c>: downloads the article, rewrites its image references to relative
/// paths, wraps it between the htmltop.txt and htmlfitter.txt templates, saves it, and downloads
/// the images it references.
/// </summary>
/// <param name="url">Address of the parent page; its fifth '/'-separated segment is used as the saved file name.</param>
/// <param name="path">Location passed to <c>WriteFile</c> for the parent page and every article page.</param>
/// <param name="imgPath">Prefix prepended to each image's site-relative folder when saving images.</param>
/// <exception cref="ArgumentException"><paramref name="url"/> has fewer than five '/'-separated segments.</exception>
/// <remarks>
/// Exceptions from downloading, parsing or file access propagate unchanged, so callers see the
/// original exception type and stack trace.
/// </remarks>
public static void GetFilterHtml(string url, string path, string imgPath)
{
    const string siteRoot = "http://www.worldwayhk.com";

    string[] arryfather = url.Split('/');
    if (arryfather.Length <= 4)
    {
        throw new ArgumentException("URL does not contain a page segment: " + url, "url");
    }

    // Encoding used for every file written below.
    Encoding code = Encoding.GetEncoding("UTF-8");

    // Save the parent page, then collect the article links from the saved copy.
    WebPage webInfo = new WebPage(url);
    string htmlSstring = webInfo.M_html;
    WriteFile(path, htmlSstring, code, arryfather[4]);
    List<string> listA = htmlFather(url, path + @"/" + arryfather[4]);

    // Header template placed in front of every article.
    string htmlTop;
    string htmltopPath = AppDomain.CurrentDomain.BaseDirectory + @"\htmltop.txt";
    using (StreamReader topReader = new StreamReader(htmltopPath, Encoding.Default))
    {
        htmlTop = topReader.ReadToEnd();
    }

    // Footer template appended to every article.
    string htmlfilter;
    string htmlfilterPath = AppDomain.CurrentDomain.BaseDirectory + @"\htmlfitter.txt";
    using (StreamReader footerReader = new StreamReader(htmlfilterPath, Encoding.Default))
    {
        htmlfilter = footerReader.ReadToEnd();
    }

    // NOTE(review): presumably restores a copyright sign lost when decoding with Encoding.Default;
    // this also rewrites every other literal '?' in the footer - confirm that is intended.
    htmlfilter = htmlfilter.Replace("?", "©");

    // Crawl each article linked from the parent page.
    foreach (string item in listA)
    {
        // htmlFather only returns links containing "/newsdetail", so there is always a second segment.
        string[] arry = item.Split('/');
        WebPage webInfoChirld = new WebPage(siteRoot + "/" + item);
        string htmlSstringChirld = webInfoChirld.M_html;
        WriteFile(path, htmlSstringChirld, code, arry[1]);
        string ycase_left = html(path + @"/" + arry[1]);

        // Collect each distinct, non-empty image source once. Replacing the same relative source
        // twice would prepend ".." twice, and String.Replace throws on an empty search string.
        List<string> listImg = new List<string>();
        foreach (string img in WebHandler.GetHtmlAttr(ycase_left, "img", "src"))
        {
            if (!string.IsNullOrEmpty(img) && !listImg.Contains(img))
            {
                listImg.Add(img);
            }
        }

        // Point image references at the local copies (relative to the saved article).
        foreach (string img in listImg)
        {
            if (img.Contains(siteRoot))
            {
                string localRef = img.Replace(siteRoot, "").Replace("/asp.net/../", "/");
                ycase_left = ycase_left.Replace(img, ".." + localRef);
            }
            else
            {
                ycase_left = ycase_left.Replace(img, ".." + img);
            }
        }

        // Wrap the article in the shared header/footer and save it over the raw download.
        ycase_left = htmlTop + ycase_left + htmlfilter;
        WriteFile(path, ycase_left.Replace("/asp.net/../", "/"), code, arry[1]);

        // Download the images. Expected site-relative layout (six segments after splitting on '/'):
        //   /KindEditor/attached/image/20160412/20160412140904_9121.jpg
        // which may arrive as /KindEditor/asp.net/../attached/image/... before normalisation.
        foreach (string img in listImg)
        {
            string imguRL = img.Replace(siteRoot, "").Replace("/asp.net/../", "/");
            string[] arryImg = imguRL.Split('/');
            if (arryImg.Length < 6)
            {
                // An image outside the expected layout must not abort the remaining articles.
                Console.WriteLine("Skipping image with unexpected path: " + img);
                continue;
            }

            string name = arryImg[5];
            string folder = imgPath + arryImg[1] + "/" + arryImg[2] + "/" + arryImg[3] + "/" + arryImg[4];
            BaoCun(folder, siteRoot + imguRL, name);
        }
    }
}
/// <summary>
/// Collects the article links on a saved parent page that should be crawled.
/// </summary>
/// <param name="url">Original address of the parent page; when it contains "usa.html" the links of the second container are filtered by position.</param>
/// <param name="path">Location of the saved parent page, handed to <c>HtmlWeb.Load</c>.</param>
/// <returns>The selected hrefs containing "/newsdetail", in the order they were encountered; empty when nothing matched.</returns>
public static List<string> htmlFather(string url, string path)
{
    HtmlAgilityPack.HtmlDocument parentDoc = new HtmlWeb().Load(path);
    bool usaPage = url.Contains("usa.html");

    // Containers whose links are harvested; the first one is never position-filtered.
    string[] containerQueries =
    {
        "//div[@class='wss_UsaHot']",
        "//div[@class='yymmg_wrap']"
    };

    List<string> selected = new List<string>();
    for (int q = 0; q < containerQueries.Length; q++)
    {
        HtmlNodeCollection containers = parentDoc.DocumentNode.SelectNodes(containerQueries[q]);
        if (containers == null)
        {
            continue;
        }

        bool filterByPosition = usaPage && q != 0;
        foreach (HtmlNode container in containers)
        {
            // Distinct article links inside this container, first occurrence wins.
            List<string> detailLinks = new List<string>();
            foreach (string href in WebHandler.GetHtmlAttr(container.OuterHtml, "a", "href"))
            {
                if (!href.Contains("/newsdetail") || detailLinks.Contains(href))
                {
                    continue;
                }

                detailLinks.Add(href);
            }

            if (!filterByPosition)
            {
                selected.AddRange(detailLinks);
                continue;
            }

            // Keep only links at these 1-based positions within the container.
            // NOTE(review): the ranges look tuned to the usa.html page layout - confirm.
            for (int i = 0; i < detailLinks.Count; i++)
            {
                int position = i + 1;
                bool wanted = (position >= 12 && position <= 17)
                    || (position >= 24 && position <= 29)
                    || (position >= 48 && position <= 53)
                    || (position >= 90 && position <= 95);
                if (wanted)
                {
                    selected.Add(detailLinks[i]);
                }
            }
        }
    }

    return selected;
}