/// <summary>
/// Walks <c>urlList</c> from its first entry to its (growing) end. For each URL the page is
/// stored under "C:/WebClient/", read back as text, and every single-quoted href found in it
/// that looks like a Kaohsiung open-data link is appended to <c>urlList</c>, so the loop
/// also visits pages discovered along the way.
/// </summary>
/// <remarks>
/// A failure on one URL is reported on the console and the crawl moves on to the next entry.
/// After the last URL the method prints "\nCompleted" and waits on <c>Console.ReadLine()</c>.
/// NOTE(review): urlToFile / fileToText / matches / toFileName are defined outside this
/// block; presumably they download, read, regex-extract and derive a file name — confirm.
/// </remarks>
public void craw()
{
    for (int position = 0; position < urlList.Count; position++)
    {
        try
        {
            String pageUrl = urlList[position];
            String localPath = "C:/WebClient/" + toFileName(pageUrl);
            Console.WriteLine(position + ":url=" + pageUrl + "\nfile=" + localPath);

            urlToFile(pageUrl, localPath);
            String pageText = fileToText(localPath);

            foreach (String link in matches("\\shref\\s*=\\s*'(.*?)'", pageText, 1))
            {
                // A link counts as known when ANY queued URL contains it as a substring.
                bool known = false;
                foreach (String queued in urlList)
                {
                    if (queued.Contains(link))
                    {
                        known = true;
                    }
                }

                bool looksLikeOpenData =
                    link.Contains("data.kaohsiung.gov.tw/Opendata/")
                    || link.Contains("List.aspx?Type=O&cidOrOrganid=")
                    || link.Contains("DetailList.aspx?");

                if (!looksLikeOpenData || known)
                {
                    continue;
                }

                Console.WriteLine(link);

                // Links that do not already carry the site root are queued with it prepended.
                String absolute = link.Contains("http://data.kaohsiung.gov.tw/Opendata/")
                    ? link
                    : "http://data.kaohsiung.gov.tw/Opendata/" + link;
                urlList.Add(absolute);
            }
        }
        catch
        {
            // Best-effort crawl: report the failing URL and keep going.
            Console.WriteLine("Error:" + urlList[position] + " fail!");
        }
    }

    Console.WriteLine("\nCompleted");
    Console.ReadLine();
}
/// <summary>
/// Walks <c>urlList</c> from its first entry to its (growing) end. Each page is stored under
/// "Kaohsiung/WebPage/" and read back as text; matching hrefs are appended to
/// <c>urlList</c>, and every open-data section found in the page is cleaned with
/// <c>TagCleaner</c> and written to its own file under "Kaohsiung/Data/".
/// </summary>
/// <remarks>
/// A failure on one URL is reported on the console and the crawl continues with the next
/// entry. When the list is exhausted "Kaohsiung Completed" is printed.
/// NOTE(review): urlToFile / fileToText / matches / toFileName / SubString / TagCleaner are
/// defined outside this block; their exact semantics should be confirmed there.
/// </remarks>
public void craw()
{
    const String siteRoot = "http://data.kaohsiung.gov.tw/Opendata/";

    int urlIdx = 0;
    while (urlIdx < urlList.Count)
    {
        try
        {
            String url = urlList[urlIdx];
            // 0 means this is the retained web page file (.html).
            String filePath = "Kaohsiung/WebPage/" + toFileName(url, 0);
            urlToFile(url, filePath);
            String html = fileToText(filePath);

            // Find links.
            foreach (String childUrl in matches("\\shref\\s*=\\s*'(.*?)'", html, 1))
            {
                // Cheap filter first, so the list scan below only runs for candidate links.
                bool isOpenDataLink =
                    childUrl.Contains("data.kaohsiung.gov.tw/Opendata/")
                    || childUrl.Contains("List.aspx?Type=O&cidOrOrganid=")
                    || childUrl.Contains("DetailList.aspx?");
                if (!isOpenDataLink)
                {
                    continue;
                }

                // A link is already queued when any queued URL contains it as a substring;
                // one hit is enough, so stop scanning at the first match.
                bool alreadyQueued = false;
                foreach (String queuedUrl in urlList)
                {
                    if (queuedUrl.Contains(childUrl))
                    {
                        alreadyQueued = true;
                        break;
                    }
                }
                if (alreadyQueued)
                {
                    continue;
                }

                // Links that do not already carry the site root are queued with it prepended.
                urlList.Add(childUrl.Contains(siteRoot) ? childUrl : siteRoot + childUrl);
            }

            // Find data.
            foreach (String openData in matches("<div><h3>(?:\\s|\\S)*</li></ul></p></div></div>", html, 0))
            {
                // '3' and '/' are the start and end markers of the substring.
                String unitName = SubString(openData, '3', '/');
                Console.WriteLine(unitName);

                // 1 means store as a text file (.txt).
                String dataPath = "Kaohsiung/Data/" + toFileName(unitName, 1);
                String cleanData = TagCleaner(openData);

                // using guarantees the file handle is released even if the write throws;
                // the catch below would otherwise swallow the error and leave it open.
                using (StreamWriter writer = new StreamWriter(dataPath))
                {
                    writer.WriteLine(cleanData);
                }
            }
        }
        catch
        {
            // Best-effort crawl: report the failing URL and keep going.
            Console.WriteLine("Error:" + urlList[urlIdx] + " fail!");
        }
        urlIdx++;
    }

    Console.WriteLine("Kaohsiung Completed");
}
/// <summary>
/// Walks <c>urlList</c> from its first entry to its (growing) end. Each page is stored under
/// "NewTaipei/WebPage/" and read back as text; hrefs containing "/NTPC/od/" are appended to
/// <c>urlList</c>, and every open-data section found in the page is written, preceded by
/// its unit name, to its own file under "NewTaipei/Data/".
/// </summary>
/// <remarks>
/// A failure on one URL is reported on the console and the crawl continues with the next
/// entry. When the list is exhausted "NewTaipei Completed" is printed.
/// NOTE(review): urlToFile / fileToText / matches / toFileName / SubString / TagCleaner are
/// defined outside this block; their exact semantics should be confirmed there.
/// </remarks>
public void craw()
{
    const String siteRoot = "http://data.ntpc.gov.tw";

    int urlIdx = 0;
    while (urlIdx < urlList.Count)
    {
        try
        {
            String url = urlList[urlIdx];
            // 0 means this is the retained web page file (.html).
            String filePath = "NewTaipei/WebPage/" + toFileName(url, 0);
            urlToFile(url, filePath);
            String html = fileToText(filePath);

            // Find links.
            foreach (String childUrl in matches("(?:\\s|\\S)'?href'?\\s*.\\s*(?:'|\")(.*?)(?:'|\")", html, 1))
            {
                // Cheap filter first, so the list scan below only runs for candidate links.
                if (!childUrl.Contains("/NTPC/od/"))
                {
                    continue;
                }

                // A link is already queued when any queued URL contains it as a substring;
                // one hit is enough, so stop scanning at the first match.
                bool alreadyQueued = false;
                foreach (String queuedUrl in urlList)
                {
                    if (queuedUrl.Contains(childUrl))
                    {
                        alreadyQueued = true;
                        break;
                    }
                }
                if (alreadyQueued)
                {
                    continue;
                }

                // Links that do not already carry the site root are queued with it prepended.
                urlList.Add(childUrl.Contains(siteRoot) ? childUrl : siteRoot + childUrl);
            }

            // Find data.
            foreach (String openData in matches("<div class=\"title\" id=\"title\">\\s*新北市(?:\\s|\\S)*分 類(?:\\s|\\S)*<div style=\"margin-left:20px;margin-right:00px\">", html, 0))
            {
                // Substring start and end points (both are the first such symbol counted
                // from the beginning).
                String unitName = SubString(openData, '新');
                Console.WriteLine(unitName);

                // 1 means store as a text file (.txt).
                String dataPath = "NewTaipei/Data/" + toFileName(unitName, 1);
                String cleanData = TagCleaner(openData);

                // using guarantees the file handle is released even if the write throws;
                // the catch below would otherwise swallow the error and leave it open.
                using (StreamWriter writer = new StreamWriter(dataPath))
                {
                    writer.WriteLine(unitName + "\r\n" + cleanData);
                }
            }
        }
        catch
        {
            // Best-effort crawl: report the failing URL and keep going.
            Console.WriteLine("Error:" + urlList[urlIdx] + " fail!");
        }
        urlIdx++;
    }

    Console.WriteLine("NewTaipei Completed");
}