Пример #1
0
    /// <summary>
    /// Crawls the Kaohsiung open-data site using <c>urlList</c> as a growing work queue.
    /// Each queued URL is downloaded to a file under <c>C:/WebClient/</c>, the saved
    /// page is scanned for single-quoted <c>href</c> values, and every link that looks
    /// like an open-data page and is not queued yet is appended to <c>urlList</c>.
    /// </summary>
    /// <remarks>
    /// A failure while processing one URL is reported on the console and the crawl
    /// continues with the next URL. The method blocks on <c>Console.ReadLine()</c>
    /// once the queue is exhausted.
    /// </remarks>
    public void craw()
    {
        const String siteRoot = "http://data.kaohsiung.gov.tw/Opendata/";

        // urlList grows while it is being walked, so it is indexed rather than enumerated.
        for (int urlIdx = 0; urlIdx < urlList.Count; urlIdx++)
        {
            try
            {
                String url      = urlList[urlIdx];
                String filePath = "C:/WebClient/" + toFileName(url);
                Console.WriteLine(urlIdx + ":url=" + url + "\nfile=" + filePath);
                urlToFile(url, filePath);
                String html = fileToText(filePath);

                // Third argument is presumably the capture-group index (1 = the href value).
                foreach (String childUrl in matches("\\shref\\s*=\\s*'(.*?)'", html, 1))
                {
                    // Filter first: most links on a page are not open-data pages,
                    // so there is no point scanning the queue for them.
                    bool isOpenDataLink = childUrl.Contains("data.kaohsiung.gov.tw/Opendata/")
                                       || childUrl.Contains("List.aspx?Type=O&cidOrOrganid=")
                                       || childUrl.Contains("DetailList.aspx?");
                    if (!isOpenDataLink)
                    {
                        continue;
                    }

                    // Relative links are resolved against the site root before the
                    // duplicate check so that queued (absolute) and new URLs compare equal.
                    String absoluteUrl = childUrl.Contains(siteRoot) ? childUrl : siteRoot + childUrl;

                    // Exact match, not substring: "...=1" must not be treated as a
                    // duplicate of an already queued "...=12".
                    if (urlList.Contains(absoluteUrl))
                    {
                        continue;
                    }

                    Console.WriteLine(childUrl);
                    urlList.Add(absoluteUrl);
                }
            }
            catch (Exception ex)
            {
                // Best effort: report why this URL failed and move on to the next one.
                Console.WriteLine("Error:" + urlList[urlIdx] + " fail! " + ex.Message);
            }
        }

        Console.WriteLine("\nCompleted");
        Console.ReadLine();
    }
Пример #2
0
        /// <summary>
        /// Crawls the Kaohsiung open-data site using <c>urlList</c> as a growing work queue.
        /// Each queued URL is saved under <c>Kaohsiung/WebPage/</c>; the saved page is then
        /// scanned for further open-data links (appended to <c>urlList</c>) and for data
        /// sections, each of which is cleaned with <c>TagCleaner</c> and written to a file
        /// under <c>Kaohsiung/Data/</c>.
        /// </summary>
        /// <remarks>
        /// A failure while processing one URL is reported on the console and the crawl
        /// continues with the next URL.
        /// </remarks>
        public void craw()
        {
            const String siteRoot = "http://data.kaohsiung.gov.tw/Opendata/";

            // urlList grows while it is being walked, so it is indexed rather than enumerated.
            for (int urlIdx = 0; urlIdx < urlList.Count; urlIdx++)
            {
                try
                {
                    String url      = urlList[urlIdx];
                    String pagePath = "Kaohsiung/WebPage/" + toFileName(url, 0);  // 0 = keep as a web page file (.html)
                    urlToFile(url, pagePath);
                    String html = fileToText(pagePath);

                    // Find links. Third argument is presumably the capture-group index.
                    foreach (String childUrl in matches("\\shref\\s*=\\s*'(.*?)'", html, 1))
                    {
                        // Filter first: most links on a page are not open-data pages,
                        // so there is no point scanning the queue for them.
                        bool isOpenDataLink = childUrl.Contains("data.kaohsiung.gov.tw/Opendata/")
                                           || childUrl.Contains("List.aspx?Type=O&cidOrOrganid=")
                                           || childUrl.Contains("DetailList.aspx?");
                        if (!isOpenDataLink)
                        {
                            continue;
                        }

                        // Relative links are resolved against the site root before the
                        // duplicate check so that queued (absolute) and new URLs compare equal.
                        String absoluteUrl = childUrl.Contains(siteRoot) ? childUrl : siteRoot + childUrl;

                        // Exact match, not substring: "...=1" must not be treated as a
                        // duplicate of an already queued "...=12".
                        if (!urlList.Contains(absoluteUrl))
                        {
                            urlList.Add(absoluteUrl);
                        }
                    }

                    // Find data sections (group 0 = the whole match).
                    foreach (String OpenData in matches("<div><h3>(?:\\s|\\S)*</li></ul></p></div></div>", html, 0))
                    {
                        // '3' and '/' are the start and end markers of the substring
                        // used as the unit name (see SubString for exact semantics).
                        String UnitName = SubString(OpenData, '3', '/');
                        Console.WriteLine(UnitName);
                        String dataPath = "Kaohsiung/Data/" + toFileName(UnitName, 1);  // 1 = save as a text file (.txt)

                        String CleanData = TagCleaner(OpenData);

                        // 'using' guarantees the file handle is released even if the write throws.
                        using (StreamWriter Sw = new StreamWriter(dataPath))
                        {
                            Sw.WriteLine(CleanData);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: report why this URL failed and move on to the next one.
                    Console.WriteLine("Error:" + urlList[urlIdx] + " fail! " + ex.Message);
                }
            }

            Console.WriteLine("Kaohsiung Completed");
        }
Пример #3
0
        /// <summary>
        /// Crawls the New Taipei open-data site using <c>urlList</c> as a growing work queue.
        /// Each queued URL is saved under <c>NewTaipei/WebPage/</c>; the saved page is then
        /// scanned for further <c>/NTPC/od/</c> links (appended to <c>urlList</c>) and for
        /// data sections, each of which is cleaned with <c>TagCleaner</c> and written,
        /// preceded by its unit name, to a file under <c>NewTaipei/Data/</c>.
        /// </summary>
        /// <remarks>
        /// A failure while processing one URL is reported on the console and the crawl
        /// continues with the next URL.
        /// </remarks>
        public void craw()
        {
            const String siteRoot = "http://data.ntpc.gov.tw";

            // urlList grows while it is being walked, so it is indexed rather than enumerated.
            for (int urlIdx = 0; urlIdx < urlList.Count; urlIdx++)
            {
                try
                {
                    String url      = urlList[urlIdx];
                    String pagePath = "NewTaipei/WebPage/" + toFileName(url, 0);  // 0 = keep as a web page file (.html)
                    urlToFile(url, pagePath);
                    String html = fileToText(pagePath);

                    // Find links. Third argument is presumably the capture-group index.
                    foreach (String childUrl in matches("(?:\\s|\\S)'?href'?\\s*.\\s*(?:'|\")(.*?)(?:'|\")", html, 1))
                    {
                        // Filter first: only open-data pages are of interest, so there
                        // is no point scanning the queue for anything else.
                        if (!childUrl.Contains("/NTPC/od/"))
                        {
                            continue;
                        }

                        // Relative links are resolved against the site root before the
                        // duplicate check so that queued (absolute) and new URLs compare equal.
                        String absoluteUrl = childUrl.Contains(siteRoot) ? childUrl : siteRoot + childUrl;

                        // Exact match, not substring: a link must not be treated as a
                        // duplicate merely because it is a prefix of a queued URL.
                        if (!urlList.Contains(absoluteUrl))
                        {
                            urlList.Add(absoluteUrl);
                        }
                    }

                    // Find data sections (group 0 = the whole match).
                    foreach (String OpenData in matches("<div class=\"title\" id=\"title\">\\s*新北市(?:\\s|\\S)*分  類(?:\\s|\\S)*<div style=\"margin-left:20px;margin-right:00px\">", html, 0))
                    {
                        // '新' marks where the unit name starts; per the original note the
                        // start and end are the first occurrences counted from the beginning
                        // (see SubString for exact semantics).
                        String UnitName = SubString(OpenData, '新');
                        Console.WriteLine(UnitName);
                        String dataPath = "NewTaipei/Data/" + toFileName(UnitName, 1);  // 1 = save as a text file (.txt)

                        String CleanData = TagCleaner(OpenData);

                        // 'using' guarantees the file handle is released even if the write throws.
                        using (StreamWriter Sw = new StreamWriter(dataPath))
                        {
                            Sw.WriteLine(UnitName + "\r\n" + CleanData);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: report why this URL failed and move on to the next one.
                    Console.WriteLine("Error:" + urlList[urlIdx] + " fail! " + ex.Message);
                }
            }

            Console.WriteLine("NewTaipei Completed");
        }