Exemple #1
0
        /// <summary>
        /// Crawls index pages of a PTT board starting from the newest one, downloads every listed
        /// article together with its push comments, shows the articles in <c>dataGridView1</c> and
        /// inserts articles and pushes into the <c>PTTDATA</c> / <c>PTTpushDATA</c> MySQL tables.
        /// </summary>
        /// <remarks>
        /// The form is minimized while the crawl runs and restored afterwards, even when an
        /// exception escapes. A failure to download the very first index page propagates to the
        /// caller; failures on later pages, single articles or database writes are logged to the
        /// console and the crawl continues. Crawling stops early when a page has no link to an
        /// older page.
        /// </remarks>
        /// <param name="pages">Number of index pages to process; values below 1 process no page.</param>
        /// <param name="str_board">Board name as it appears in the URL path (<c>/bbs/{board}/</c>).</param>
        /// <returns>A task that completes when the crawl has finished.</returns>
        private async Task PTTWebCrawlerasync_DB(int pages, string str_board)
        {
            this.WindowState = FormWindowState.Minimized;
            try
            {
                Console.WriteLine("PTT Crawler_DB is Running");
                string   baseurl           = "https://www.ptt.cc";
                string   str_url           = $"{baseurl}/bbs/{str_board}/index.html";
                string[] skippedTitleMarks = { "公告", "徵求", "板務", "問卷" };

                progressBar1.Maximum = 100;
                progressBar1.Value   = 0;
                progressBar1.Minimum = 0;
                // Guard the division: pages == 0 used to throw DivideByZeroException.
                int step = pages > 0 ? 100 / pages : 0;

                // Database settings are loop-invariant, so build the connection string once.
                string dbHost  = "127.0.0.1"; // database host
                string dbUser  = "******";   // database user
                string dbPass  = "******";      // database password
                string dbName  = "webptt";    // database name
                string connStr = "server=" + dbHost + ";uid=" + dbUser + ";pwd=" + dbPass + ";database=" + dbName;

                // One handler/client for the whole crawl instead of a new pair per request.
                // The explicit "/" path makes the age-confirmation cookie apply to every URL on the host.
                HttpClientHandler handler = new HttpClientHandler();
                handler.CookieContainer = new CookieContainer();
                handler.CookieContainer.Add(new Uri(baseurl), new Cookie("over18", "1", "/"));

                var list = new List <WebPTT>();

                using (HttpClient httpClient = new HttpClient(handler))
                {
                    // The first index page is fetched outside the per-page try so that a failure
                    // here still surfaces to the caller, as it did before.
                    string html = await httpClient.GetStringAsync(str_url);

                    for (int i = 1; i <= pages; i++)
                    {
                        bool hasOlderPage = true;
                        try
                        {
                            if (i > 1)
                            {
                                html = await httpClient.GetStringAsync(str_url);
                            }

                            HtmlDocument indexDocument = new HtmlDocument();
                            indexDocument.LoadHtml(html);

                            // The second link of the paging button group points to the older page.
                            string olderPage    = null;
                            var    pagingGroups = indexDocument.DocumentNode.Descendants("div")
                                                  .Where(node => node.GetAttributeValue("class", "").Equals("btn-group btn-group-paging")).ToList();
                            foreach (var pagingGroup in pagingGroups)
                            {
                                string href = pagingGroup.Descendants("a").ElementAtOrDefault(1)?.ChildAttributes("href").FirstOrDefault()?.Value;
                                if (!string.IsNullOrEmpty(href))
                                {
                                    olderPage = href;
                                }
                            }

                            // Advance before processing the articles: if something below fails,
                            // the next iteration must not crawl (and store) this page again.
                            if (olderPage != null)
                            {
                                str_url = $"{baseurl}{olderPage}";
                                Console.WriteLine(str_url);
                            }
                            else
                            {
                                hasOlderPage = false;
                            }

                            var entries = indexDocument.DocumentNode.Descendants("div")
                                          .Where(node => node.GetAttributeValue("class", "").Equals("r-ent")).ToList();

                            foreach (var entry in entries)
                            {
                                // Descendant divs in document order: [0] popularity, [1] title, [2] meta block.
                                var cells = entry.Descendants("div").ToList();
                                if (cells.Count < 3)
                                {
                                    continue; // unexpected row layout, nothing usable to read
                                }

                                string title = cells[1].InnerText.Trim();
                                if (skippedTitleMarks.Any(mark => title.Contains(mark)))
                                {
                                    continue;
                                }

                                WebPTT webptt     = new WebPTT();
                                string popularity = cells[0].InnerText.Trim();
                                if (popularity.Contains("X"))
                                {
                                    webptt.Popularity_ptt = popularity.Replace('X', '-');
                                }
                                else if (popularity == "")
                                {
                                    webptt.Popularity_ptt = "0";
                                }
                                else
                                {
                                    webptt.Popularity_ptt = popularity;
                                }

                                webptt.title_ptt  = title;
                                webptt.author_ptt = cells[2].Descendants("div").FirstOrDefault()?.InnerText.Trim() ?? "";
                                // Null when the row has no link or the link has no href.
                                webptt.URL_ptt    = entry.Descendants("a").FirstOrDefault()?.ChildAttributes("href").FirstOrDefault()?.Value;

                                #region Crawl article content
                                if (!string.IsNullOrEmpty(webptt.URL_ptt))
                                {
                                    try
                                    {
                                        string       articleHtml     = await httpClient.GetStringAsync(baseurl + webptt.URL_ptt);
                                        HtmlDocument articleDocument = new HtmlDocument();
                                        articleDocument.LoadHtml(articleHtml);

                                        var contents = articleDocument.DocumentNode.Descendants("div")
                                                       .Where(node => node.GetAttributeValue("class", "").Equals("bbs-screen bbs-content")).ToList();
                                        foreach (var content in contents)
                                        {
                                            // Put the board and time header fields on their own lines;
                                            // a missing marker is skipped instead of throwing from Insert(-1).
                                            string text    = content.InnerText;
                                            int    boardAt = text.IndexOf("看板", StringComparison.Ordinal);
                                            if (boardAt >= 0)
                                            {
                                                text = text.Insert(boardAt, "\n");
                                            }
                                            int timeAt = text.IndexOf("時間", StringComparison.Ordinal);
                                            if (timeAt >= 0)
                                            {
                                                text = text.Insert(timeAt, "\n");
                                            }
                                            webptt.InnerText_ptt = text;
                                        }

                                        var pushNodes = articleDocument.DocumentNode.Descendants("div")
                                                        .Where(node => node.GetAttributeValue("class", "").Equals("push")).ToList();
                                        List <WebPTT_Push> pushInfo = new List <WebPTT_Push>();
                                        foreach (var push in pushNodes)
                                        {
                                            // Spans in order: tag, user id, content, timestamp.
                                            var spans = push.Descendants("span").ToList();
                                            if (spans.Count < 4)
                                            {
                                                Console.WriteLine("Get Push info Data Error:unexpected push markup");
                                                continue;
                                            }

                                            WebPTT_Push obpush = new WebPTT_Push();
                                            obpush.push_tag = spans[0].InnerText.Trim();
                                            obpush.user_id  = spans[1].InnerText.Trim();
                                            obpush.context  = spans[2].InnerText.Trim();
                                            obpush.datetime = spans[3].InnerText.Trim();
                                            pushInfo.Add(obpush);
                                        }
                                        webptt.ptt_Push_info = pushInfo;
                                    }
                                    catch (Exception x)
                                    {
                                        Console.WriteLine("Context :" + x.Message);
                                    }
                                }
                                #endregion

                                list.Add(webptt);
                            }
                        }
                        catch (Exception e)
                        {
                            Console.WriteLine($"Crawling Error: {e.Message}");
                        }
                        progressBar1.Value = Math.Min(progressBar1.Maximum, progressBar1.Value + step);

                        #region DB_Write
                        try
                        {
                            using (MySqlConnection conn = new MySqlConnection(connStr))
                            using (MySqlCommand command = conn.CreateCommand())
                            using (MySqlCommand command1 = conn.CreateCommand())
                            {
                                // Parameterized statements: scraped text routinely contains quotes,
                                // which broke (and could hijack) the former string-built SQL.
                                command.CommandText  = "Insert into PTTDATA (ptt_pop,ptt_title,ptt_author,ptt_url,ptt_context,ptt_date) values(@pop,@title,@author,@url,@context,@date)";
                                command1.CommandText = "Insert into PTTpushDATA (push_Tag,push_User,push_Content,push_URL,push_Date) values(@tag,@user,@content,@url,@date)";

                                conn.Open();
                                foreach (var token in list)
                                {
                                    if (token.InnerText_ptt != null)
                                    {
                                        dataGridView1.Rows.Add(token.Popularity_ptt, token.title_ptt, token.author_ptt, baseurl + token.URL_ptt, token.InnerText_ptt);

                                        command.Parameters.Clear();
                                        command.Parameters.AddWithValue("@pop", token.Popularity_ptt);
                                        command.Parameters.AddWithValue("@title", token.title_ptt);
                                        command.Parameters.AddWithValue("@author", token.author_ptt);
                                        command.Parameters.AddWithValue("@url", baseurl + token.URL_ptt);
                                        command.Parameters.AddWithValue("@context", token.InnerText_ptt);
                                        command.Parameters.AddWithValue("@date", "DATE"); // placeholder value, same as before
                                        command.ExecuteNonQuery();
                                    }

                                    if (token.ptt_Push_info == null)
                                    {
                                        continue; // article was never downloaded, so there are no pushes
                                    }

                                    try
                                    {
                                        foreach (var pushtoken in token.ptt_Push_info)
                                        {
                                            // Skip instead of re-running the previous statement.
                                            if (pushtoken.context == null)
                                            {
                                                continue;
                                            }

                                            command1.Parameters.Clear();
                                            command1.Parameters.AddWithValue("@tag", pushtoken.push_tag);
                                            command1.Parameters.AddWithValue("@user", pushtoken.user_id);
                                            command1.Parameters.AddWithValue("@content", pushtoken.context);
                                            command1.Parameters.AddWithValue("@url", token.URL_ptt);
                                            command1.Parameters.AddWithValue("@date", pushtoken.datetime);
                                            command1.ExecuteNonQuery();
                                        }
                                    }
                                    catch (Exception Push_DB)
                                    {
                                        Console.WriteLine($"Push_DB Error:{Push_DB.Message}");
                                    }
                                }
                            }
                        }
                        catch (Exception DBEX)
                        {
                            Console.WriteLine($"DB Connect Error:{DBEX.Message}");
                        }
                        #endregion
                        list.Clear();

                        if (!hasOlderPage)
                        {
                            break; // no older page link: nothing further to crawl
                        }
                    }
                }

                #region Result
                progressBar1.Value = 0;
                Console.Clear();
                Console.WriteLine("Done");
                Console.WriteLine("TotalData:" + dataGridView1.RowCount.ToString());
                Console.Beep();
                #endregion
            }
            finally
            {
                // Always give the window back, even when the crawl aborts with an exception.
                this.WindowState = FormWindowState.Normal;
            }
        }
Exemple #2
0
        /// <summary>
        /// Crawls index pages of a PTT board starting from the newest one, downloads every listed
        /// article together with its push comments, appends one summary line per article to
        /// <c>log.text</c> and shows the articles in <c>dataGridView1</c>.
        /// </summary>
        /// <remarks>
        /// The form is minimized while the crawl runs and restored afterwards, even when an
        /// exception escapes. A failure to download the very first index page propagates to the
        /// caller; failures on later pages or single articles are logged to the console and the
        /// crawl continues. Crawling stops early when a page has no link to an older page.
        /// </remarks>
        /// <param name="pages">Number of index pages to process; values below 1 process no page.</param>
        /// <param name="str_board">Board name as it appears in the URL path (<c>/bbs/{board}/</c>).</param>
        /// <returns>A task that completes when the crawl has finished.</returns>
        private async Task PTTWebCrawlerasync(int pages, string str_board)
        {
            this.WindowState = FormWindowState.Minimized;
            try
            {
                Console.WriteLine("PTT Crawler is Running");
                string   baseurl           = "https://www.ptt.cc";
                string   str_url           = $"{baseurl}/bbs/{str_board}/index.html";
                string[] skippedTitleMarks = { "公告", "徵求", "板務", "問卷" };

                progressBar1.Maximum = 100;
                progressBar1.Value   = 0;
                progressBar1.Minimum = 0;
                // Guard the division: pages == 0 used to throw DivideByZeroException.
                int step = pages > 0 ? 100 / pages : 0;

                // One handler/client for the whole crawl instead of a new pair per request.
                // The explicit "/" path makes the age-confirmation cookie apply to every URL on the host.
                HttpClientHandler handler = new HttpClientHandler();
                handler.CookieContainer = new CookieContainer();
                handler.CookieContainer.Add(new Uri(baseurl), new Cookie("over18", "1", "/"));

                var list = new List <WebPTT>();

                using (HttpClient httpClient = new HttpClient(handler))
                {
                    // The first index page is fetched outside the per-page try so that a failure
                    // here still surfaces to the caller, as it did before.
                    string html = await httpClient.GetStringAsync(str_url);

                    for (int i = 1; i <= pages; i++)
                    {
                        bool hasOlderPage = true;
                        try
                        {
                            if (i > 1)
                            {
                                html = await httpClient.GetStringAsync(str_url);
                            }

                            HtmlDocument indexDocument = new HtmlDocument();
                            indexDocument.LoadHtml(html);

                            // The second link of the paging button group points to the older page.
                            string olderPage    = null;
                            var    pagingGroups = indexDocument.DocumentNode.Descendants("div")
                                                  .Where(node => node.GetAttributeValue("class", "").Equals("btn-group btn-group-paging")).ToList();
                            foreach (var pagingGroup in pagingGroups)
                            {
                                string href = pagingGroup.Descendants("a").ElementAtOrDefault(1)?.ChildAttributes("href").FirstOrDefault()?.Value;
                                if (!string.IsNullOrEmpty(href))
                                {
                                    olderPage = href;
                                }
                            }

                            // Advance before processing the articles: if something below fails,
                            // the next iteration must not crawl (and list) this page again.
                            if (olderPage != null)
                            {
                                str_url = $"{baseurl}{olderPage}";
                                Console.WriteLine(str_url);
                            }
                            else
                            {
                                hasOlderPage = false;
                            }

                            var entries = indexDocument.DocumentNode.Descendants("div")
                                          .Where(node => node.GetAttributeValue("class", "").Equals("r-ent")).ToList();

                            foreach (var entry in entries)
                            {
                                // Descendant divs in document order: [0] popularity, [1] title, [2] meta block.
                                var cells = entry.Descendants("div").ToList();
                                if (cells.Count < 3)
                                {
                                    continue; // unexpected row layout, nothing usable to read
                                }

                                string title = cells[1].InnerText.Trim();
                                if (skippedTitleMarks.Any(mark => title.Contains(mark)))
                                {
                                    continue;
                                }

                                WebPTT webptt     = new WebPTT();
                                string popularity = cells[0].InnerText.Trim();
                                if (popularity.Contains("X"))
                                {
                                    webptt.Popularity_ptt = popularity.Replace('X', '-');
                                }
                                else if (popularity == "")
                                {
                                    webptt.Popularity_ptt = "0";
                                }
                                else
                                {
                                    webptt.Popularity_ptt = popularity;
                                }

                                webptt.title_ptt  = title;
                                webptt.author_ptt = cells[2].Descendants("div").FirstOrDefault()?.InnerText.Trim() ?? "";
                                // Null when the row has no link or the link has no href.
                                webptt.URL_ptt    = entry.Descendants("a").FirstOrDefault()?.ChildAttributes("href").FirstOrDefault()?.Value;

                                #region Crawl article content
                                if (!string.IsNullOrEmpty(webptt.URL_ptt))
                                {
                                    try
                                    {
                                        string       articleHtml     = await httpClient.GetStringAsync(baseurl + webptt.URL_ptt);
                                        HtmlDocument articleDocument = new HtmlDocument();
                                        articleDocument.LoadHtml(articleHtml);

                                        var contents = articleDocument.DocumentNode.Descendants("div")
                                                       .Where(node => node.GetAttributeValue("class", "").Equals("bbs-screen bbs-content")).ToList();
                                        foreach (var content in contents)
                                        {
                                            // Put the board and time header fields on their own lines;
                                            // a missing marker is skipped instead of throwing from Insert(-1).
                                            string text    = content.InnerText;
                                            int    boardAt = text.IndexOf("看板", StringComparison.Ordinal);
                                            if (boardAt >= 0)
                                            {
                                                text = text.Insert(boardAt, "\n");
                                            }
                                            int timeAt = text.IndexOf("時間", StringComparison.Ordinal);
                                            if (timeAt >= 0)
                                            {
                                                text = text.Insert(timeAt, "\n");
                                            }
                                            webptt.InnerText_ptt = text;
                                        }

                                        var pushNodes = articleDocument.DocumentNode.Descendants("div")
                                                        .Where(node => node.GetAttributeValue("class", "").Equals("push")).ToList();
                                        List <WebPTT_Push> pushInfo = new List <WebPTT_Push>();
                                        foreach (var push in pushNodes)
                                        {
                                            // Spans in order: tag, user id, content, timestamp.
                                            var spans = push.Descendants("span").ToList();
                                            if (spans.Count < 4)
                                            {
                                                Console.WriteLine("Get Push info Data Error:unexpected push markup");
                                                continue;
                                            }

                                            WebPTT_Push obpush = new WebPTT_Push();
                                            obpush.push_tag = spans[0].InnerText.Trim();
                                            obpush.user_id  = spans[1].InnerText.Trim();
                                            obpush.context  = spans[2].InnerText.Trim();
                                            obpush.datetime = spans[3].InnerText.Trim();
                                            pushInfo.Add(obpush);
                                        }
                                        // Hand the list over as-is. It was previously cleared right after
                                        // this assignment, which emptied the article's push info as well.
                                        webptt.ptt_Push_info = pushInfo;
                                    }
                                    catch (Exception x)
                                    {
                                        Console.WriteLine("Context :" + x.Message);
                                    }
                                }
                                #endregion

                                list.Add(webptt);
                            }
                        }
                        catch (Exception e)
                        {
                            Console.WriteLine($"Crawling Error: {e.Message}");
                        }
                        progressBar1.Value = Math.Min(progressBar1.Maximum, progressBar1.Value + step);

                        foreach (var token in list)
                        {
                            File.AppendAllText("log.text", token.Popularity_ptt + "   " + token.title_ptt + "   " + token.author_ptt + " " + token.URL_ptt + "\n");
                            dataGridView1.Rows.Add(token.Popularity_ptt, token.title_ptt, token.author_ptt, baseurl + token.URL_ptt, token.InnerText_ptt);
                        }
                        list.Clear();

                        if (!hasOlderPage)
                        {
                            break; // no older page link: nothing further to crawl
                        }
                    }
                }

                #region Result
                progressBar1.Value = 0;
                Console.Clear();
                Console.WriteLine("Done");
                Console.WriteLine("TotalData:" + dataGridView1.RowCount.ToString());
                #endregion
            }
            finally
            {
                // Always give the window back, even when the crawl aborts with an exception.
                this.WindowState = FormWindowState.Normal;
            }
        }