Пример #1
0
        /// <summary>
        /// Reads the URL list from WIKIFX.csv and the output root from PathSetting.txt
        /// (both located in the application base directory), then calls
        /// <c>WebHandler.GetFilterHtml</c> once per CSV row.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string csvPath     = AppDomain.CurrentDomain.BaseDirectory + @"\WIKIFX.csv";
            string settingPath = AppDomain.CurrentDomain.BaseDirectory + @"\PathSetting.txt";

            // File.ReadAllText opens, reads and closes the file in one call, so the
            // handle is released even if decoding fails (the previous StreamReader
            // was never disposed).
            string htmlPath = File.ReadAllText(settingPath, Encoding.Default);
            var    data     = WebHandler.readCsvTxt(csvPath);

            foreach (DataRow dr in data.Rows)
            {
                // Everything that depends on the row's content lives inside the try,
                // so one bad row is reported and skipped instead of ending the run.
                try
                {
                    string   url      = dr[0].ToString();
                    string[] segments = url.Split('/');

                    // The fifth '/'-separated segment (index 4) names the output folder.
                    if (segments.Length < 5)
                    {
                        Console.WriteLine("Skipping row, URL has fewer than 5 '/'-separated segments: " + url);
                        continue;
                    }

                    string path = segments[4].Replace(".html", "");
                    WebHandler.GetFilterHtml(url, htmlPath + path, htmlPath);
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.Message);
                }
            }
        }
Пример #2
0
 /// <summary>
 /// Fetches the page at <paramref name="url"/>, writes it out, then processes every
 /// child link that <c>htmlFather</c> returns for it: each child page is fetched,
 /// its image addresses are rewritten to relative ones, the contents of
 /// htmltop.txt / htmlfitter.txt (from the application base directory) are placed
 /// before / after it, the result is written out, and each image is handed to
 /// <c>BaoCun</c>.
 /// </summary>
 /// <param name="url">
 /// Address of the parent page. Its fifth '/'-separated segment (index 4) is used
 /// as the parent page's file name.
 /// </param>
 /// <param name="path">Target location passed to <c>WriteFile</c> for all pages.</param>
 /// <param name="imgPath">Prefix of the target location passed to <c>BaoCun</c> for images.</param>
 /// <exception cref="Exception">
 /// Any failure is rethrown as <see cref="Exception"/> carrying the original message;
 /// the original exception is available through <c>InnerException</c>.
 /// </exception>
 public static void GetFilterHtml(string url, string path, string imgPath)
 {
     try
     {
         Encoding code = Encoding.GetEncoding("UTF-8"); // encoding handed to WriteFile

         WebPage  parentPage  = new WebPage(url);
         string   parentHtml  = parentPage.M_html;
         string[] urlSegments = url.Split('/');
         WriteFile(path, parentHtml, code, urlSegments[4]);
         List <string> childLinks = htmlFather(url, path + @"/" + urlSegments[4]);

         // Text placed before every saved child page. File.ReadAllText closes the
         // file itself; the previous StreamReader instances were never disposed.
         string htmlTop = File.ReadAllText(AppDomain.CurrentDomain.BaseDirectory + @"\htmltop.txt", Encoding.Default);

         // Text placed after every saved child page.
         string htmlfilter = File.ReadAllText(AppDomain.CurrentDomain.BaseDirectory + @"\htmlfitter.txt", Encoding.Default);
         // NOTE(review): presumably repairs a copyright sign that was lost when the
         // file was decoded - confirm; every '?' in the text is replaced.
         htmlfilter = htmlfilter.Replace("?", "©");

         // Process the child pages linked from the parent page.
         foreach (string item in childLinks)
         {
             string[] linkSegments = item.Split('/');
             WebPage  childPage    = new WebPage("http://www.worldwayhk.com/" + item);
             WriteFile(path, childPage.M_html, code, linkSegments[1]);
             string ycase_left = html(path + @"/" + linkSegments[1]);

             // Collect each image address once, in first-seen order. Processing a
             // repeated relative address twice would prefix it with ".." twice
             // (the second Replace matches inside the already rewritten text) and
             // would also hand the same image to BaoCun more than once.
             List <string>    images = new List <string>();
             HashSet <string> seen   = new HashSet <string>();
             foreach (string img in WebHandler.GetHtmlAttr(ycase_left, "img", "src"))
             {
                 if (seen.Add(img))
                 {
                     images.Add(img);
                 }
             }

             // Rewrite the image addresses inside the page to relative ones.
             foreach (string img in images)
             {
                 if (img.Contains("http://www.worldwayhk.com"))
                 {
                     string relative = img.Replace("http://www.worldwayhk.com", "").Replace("/asp.net/../", "/");
                     ycase_left = ycase_left.Replace(img, ".." + relative);
                 }
                 else
                 {
                     ycase_left = ycase_left.Replace(img, ".." + img);
                 }
             }

             ycase_left = htmlTop + ycase_left;
             ycase_left = ycase_left + htmlfilter;

             // Save the article page.
             WriteFile(path, ycase_left.Replace("/asp.net/../", "/"), code, linkSegments[1]);

             // Hand every image to BaoCun. Example addresses seen on the site:
             //   /KindEditor/attached/image/20160412/20160412140904_9121.jpg
             //   /KindEditor/asp.net/../attached/image/20161019/20161019162715_5449.jpg
             // NOTE(review): indexes 1..5 below require at least six '/'-separated
             // parts; a shorter address throws and ends processing of this url.
             foreach (string img in images)
             {
                 string   imguRL  = img.Replace("http://www.worldwayhk.com", "").Replace("/asp.net/../", "/");
                 string[] arryImg = imguRL.Split('/');
                 string   name    = arryImg[5];
                 imguRL = "http://www.worldwayhk.com" + imguRL;
                 BaoCun(imgPath + arryImg[1] + "/" + arryImg[2] + "/" + arryImg[3] + "/" + arryImg[4], imguRL, name);
             }
         }
     }
     catch (Exception e)
     {
         // Same exception type and message as before, but the original exception
         // (with its type and stack trace) is kept as InnerException.
         throw new Exception(e.Message, e);
     }
 }
Пример #3
0
        /// <summary>
        /// Collects, from the parent page, the links that should be crawled.
        /// The document is loaded from <paramref name="path"/>; inside every
        /// <c>div</c> with class <c>wss_UsaHot</c> or <c>yymmg_wrap</c> the distinct
        /// <c>href</c> values containing "/newsdetail" are gathered.
        /// </summary>
        /// <param name="url">
        /// Address of the parent page. Only inspected for the text "usa.html": when
        /// present, links from <c>yymmg_wrap</c> blocks are kept only at 1-based
        /// positions 12-17, 24-29, 48-53 and 90-95 within each block.
        /// </param>
        /// <param name="path">Location handed to <c>HtmlWeb.Load</c>.</param>
        /// <returns>The selected links, in document order; may contain repeats across blocks.</returns>
        public static List <string> htmlFather(string url, string path)
        {
            var collected = new List <string>();

            HtmlWeb loader = new HtmlWeb();
            HtmlAgilityPack.HtmlDocument page = loader.Load(path);

            // Index 0 is never position-filtered; index 1 is filtered on "usa" pages.
            string[] containerQueries =
            {
                "//div[@class='wss_UsaHot']",
                "//div[@class='yymmg_wrap']"
            };
            bool usaPage = url.Contains("usa.html");

            for (int q = 0; q < containerQueries.Length; q++)
            {
                HtmlNodeCollection containers = page.DocumentNode.SelectNodes(containerQueries[q]);
                if (containers == null)
                {
                    continue;
                }

                bool pickByPosition = usaPage && q != 0;

                foreach (HtmlNode container in containers)
                {
                    // Distinct "/newsdetail" links of this block, first occurrence wins.
                    var detailLinks = new List <string>();
                    foreach (string href in WebHandler.GetHtmlAttr(container.OuterHtml, "a", "href"))
                    {
                        if (!href.Contains("/newsdetail") || detailLinks.Contains(href))
                        {
                            continue;
                        }
                        detailLinks.Add(href);
                    }

                    if (!pickByPosition)
                    {
                        collected.AddRange(detailLinks);
                        continue;
                    }

                    // Keep only the links sitting at the wanted 1-based positions.
                    for (int k = 0; k < detailLinks.Count; k++)
                    {
                        int  position = k + 1;
                        bool wanted   = (position >= 12 && position <= 17)
                                        || (position >= 24 && position <= 29)
                                        || (position >= 48 && position <= 53)
                                        || (position >= 90 && position <= 95);
                        if (wanted)
                        {
                            collected.Add(detailLinks[k]);
                        }
                    }
                }
            }

            return collected;
        }