/// <summary>
/// Crawls the index pages of a PTT board (newest page first), downloads every listed article together
/// with its push comments, shows the articles in <c>dataGridView1</c> and stores articles and pushes in
/// the MySQL tables <c>PTTDATA</c> and <c>PTTpushDATA</c>.
/// </summary>
/// <param name="pages">
/// Number of index pages to crawl, following the "previous page" link after each one.
/// Values below 1 crawl nothing. Crawling stops early when a page has no previous-page link.
/// </param>
/// <param name="str_board">Board name as it appears in the URL path (<c>/bbs/{str_board}/</c>).</param>
/// <returns>A task that completes once all requested pages have been processed.</returns>
/// <remarks>
/// Download, parsing and database errors are caught per page / per article, written to the console and
/// do not abort the crawl. The form is minimized while the crawl runs and is always restored afterwards.
/// </remarks>
private async Task PTTWebCrawlerasync_DB(int pages, string str_board)
{
    this.WindowState = FormWindowState.Minimized;
    Console.WriteLine("PTT Crawler_DB is Running");

    string baseurl = "https://www.ptt.cc";
    string str_index = "index.html";
    string str_url = $"{baseurl}/bbs/{str_board}/{str_index}";

    progressBar1.Minimum = 0;
    progressBar1.Maximum = 100;
    progressBar1.Value = 0;

    // NOTE(review): credentials are hard-coded here; consider moving them to configuration.
    string dbHost = "127.0.0.1"; // database host
    string dbUser = "******";    // database user
    string dbPass = "******";    // database password
    string dbName = "webptt";    // database name
    string connStr = "server=" + dbHost + ";uid=" + dbUser + ";pwd=" + dbPass + ";database=" + dbName;

    // One handler/client for the whole crawl. The "over18" cookie is registered for the whole host so it
    // is sent with index pages and article pages alike.
    var handler = new HttpClientHandler { CookieContainer = new CookieContainer() };
    handler.CookieContainer.Add(new Cookie("over18", "1", "/", new Uri(baseurl).Host));

    try
    {
        // Awaits below deliberately keep the UI synchronization context: the code touches form controls.
        using (var httpClient = new HttpClient(handler))
        {
            var list = new List<WebPTT>();

            for (int i = 1; i <= pages; i++)
            {
                // Set only when a page was parsed successfully and has no older page to go to.
                bool reachedOldestPage = false;

                try
                {
                    string indexHtml = await httpClient.GetStringAsync(str_url);
                    var indexDocument = new HtmlDocument();
                    indexDocument.LoadHtml(indexHtml);

                    // The second anchor of the paging button group links to the previous (older) page.
                    // On the oldest page that anchor has no href, which yields an empty string here.
                    var pagingGroup = indexDocument.DocumentNode.Descendants("div")
                        .LastOrDefault(node => node.GetAttributeValue("class", "").Equals("btn-group btn-group-paging"));
                    var olderLink = pagingGroup?.Descendants("a").ElementAtOrDefault(1);
                    string pageUp = olderLink?.GetAttributeValue("href", "") ?? "";

                    var rows = indexDocument.DocumentNode.Descendants("div")
                        .Where(node => node.GetAttributeValue("class", "").Equals("r-ent"))
                        .ToList();

                    foreach (var row in rows)
                    {
                        // Expected descendant order: [0] popularity, [1] title, [2] meta (contains author).
                        var cells = row.Descendants("div").ToList();
                        if (cells.Count < 3)
                        {
                            continue; // malformed entry: skip it instead of losing the whole page
                        }

                        string title = cells[1].InnerText.Trim();
                        if (title.Contains("公告") || title.Contains("徵求") || title.Contains("板務") || title.Contains("問卷"))
                        {
                            continue; // announcements / requests / board affairs / surveys are not collected
                        }

                        WebPTT webptt = new WebPTT();

                        string popularity = cells[0].InnerText.Trim();
                        if (popularity.Contains("X"))
                        {
                            webptt.Popularity_ptt = popularity.Replace('X', '-');
                        }
                        else if (popularity == "")
                        {
                            webptt.Popularity_ptt = "0";
                        }
                        else
                        {
                            webptt.Popularity_ptt = popularity;
                        }

                        webptt.title_ptt = title;
                        webptt.author_ptt = cells[2].Descendants("div").FirstOrDefault()?.InnerText.Trim() ?? "";

                        // Entries without a link (no anchor or no href) keep a null URL and are not fetched.
                        string href = row.Descendants("a").FirstOrDefault()?.GetAttributeValue("href", "");
                        webptt.URL_ptt = string.IsNullOrEmpty(href) ? null : href;

                        if (!string.IsNullOrEmpty(webptt.URL_ptt))
                        {
                            try
                            {
                                string articleHtml = await httpClient.GetStringAsync(baseurl + webptt.URL_ptt);
                                var articleDocument = new HtmlDocument();
                                articleDocument.LoadHtml(articleHtml);

                                var contents = articleDocument.DocumentNode.Descendants("div")
                                    .Where(node => node.GetAttributeValue("class", "").Equals("bbs-screen bbs-content"))
                                    .ToList();
                                foreach (var content in contents)
                                {
                                    // Put a line break before the "看板" marker and, if that one was found,
                                    // before the "時間" marker as well. Text without the markers is kept as is.
                                    string text = content.InnerText;
                                    int boardIndex = text.IndexOf("看板", StringComparison.Ordinal);
                                    if (boardIndex >= 0)
                                    {
                                        text = text.Insert(boardIndex, "\n");
                                        int timeIndex = text.IndexOf("時間", StringComparison.Ordinal);
                                        if (timeIndex >= 0)
                                        {
                                            text = text.Insert(timeIndex, "\n");
                                        }
                                    }
                                    webptt.InnerText_ptt = text;
                                }

                                var pushNodes = articleDocument.DocumentNode.Descendants("div")
                                    .Where(node => node.GetAttributeValue("class", "").Equals("push"))
                                    .ToList();
                                var pushList = new List<WebPTT_Push>();
                                foreach (var push in pushNodes)
                                {
                                    // Expected span order: tag, user id, content, date/time.
                                    var spans = push.Descendants("span").ToList();
                                    if (spans.Count < 4)
                                    {
                                        Console.WriteLine("Get Push info Data Error:push entry has fewer than 4 spans");
                                        continue;
                                    }

                                    WebPTT_Push obpush = new WebPTT_Push();
                                    obpush.push_tag = spans[0].InnerText.Trim();
                                    obpush.user_id = spans[1].InnerText.Trim();
                                    obpush.context = spans[2].InnerText.Trim();
                                    obpush.datetime = spans[3].InnerText.Trim();
                                    pushList.Add(obpush);
                                }
                                webptt.ptt_Push_info = pushList;
                            }
                            catch (Exception x)
                            {
                                Console.WriteLine("Context :" + x.Message);
                            }
                        }

                        list.Add(webptt);
                    }

                    if (pageUp == "")
                    {
                        reachedOldestPage = true;
                    }
                    else
                    {
                        str_url = $"{baseurl}{pageUp}";
                        Console.WriteLine(str_url);
                    }
                }
                catch (Exception e)
                {
                    // str_url is left unchanged, so the next iteration retries the same page.
                    Console.WriteLine($"Crawling Error: {e.Message}");
                }

                progressBar1.Value = i * 100 / pages;

                #region DB_Write
                try
                {
                    using (MySqlConnection conn = new MySqlConnection(connStr))
                    using (MySqlCommand command = conn.CreateCommand())
                    using (MySqlCommand command1 = conn.CreateCommand())
                    {
                        // Scraped text is untrusted: pass every value as a parameter, never inline it in SQL.
                        command.CommandText = "Insert into PTTDATA (ptt_pop,ptt_title,ptt_author,ptt_url,ptt_context,ptt_date) values(@pop,@title,@author,@url,@context,@date)";
                        command1.CommandText = "Insert into PTTpushDATA (push_Tag,push_User,push_Content,push_URL,push_Date) values(@tag,@user,@content,@url,@date)";

                        conn.Open();

                        foreach (var token in list)
                        {
                            // Only articles whose body was downloaded are shown and stored.
                            if (token.InnerText_ptt != null)
                            {
                                dataGridView1.Rows.Add(token.Popularity_ptt, token.title_ptt, token.author_ptt, baseurl + token.URL_ptt, token.InnerText_ptt);

                                command.Parameters.Clear();
                                command.Parameters.AddWithValue("@pop", token.Popularity_ptt);
                                command.Parameters.AddWithValue("@title", token.title_ptt);
                                command.Parameters.AddWithValue("@author", token.author_ptt);
                                command.Parameters.AddWithValue("@url", baseurl + token.URL_ptt);
                                command.Parameters.AddWithValue("@context", token.InnerText_ptt);
                                // The literal text "DATE" is stored as a placeholder; no article date is parsed.
                                command.Parameters.AddWithValue("@date", "DATE");
                                command.ExecuteNonQuery();
                            }

                            if (token.ptt_Push_info == null)
                            {
                                continue; // article was never fetched, so there are no pushes to store
                            }

                            try
                            {
                                foreach (var pushtoken in token.ptt_Push_info)
                                {
                                    if (pushtoken.context == null)
                                    {
                                        continue; // nothing to insert; do not re-run the previous command
                                    }

                                    command1.Parameters.Clear();
                                    command1.Parameters.AddWithValue("@tag", pushtoken.push_tag);
                                    command1.Parameters.AddWithValue("@user", pushtoken.user_id);
                                    command1.Parameters.AddWithValue("@content", pushtoken.context);
                                    // NOTE(review): pushes store the relative URL while PTTDATA stores the absolute one.
                                    command1.Parameters.AddWithValue("@url", token.URL_ptt);
                                    command1.Parameters.AddWithValue("@date", pushtoken.datetime);
                                    command1.ExecuteNonQuery();
                                }
                            }
                            catch (Exception Push_DB)
                            {
                                Console.WriteLine($"Push_DB Error:{Push_DB.Message}");
                            }
                        }
                    }
                }
                catch (Exception DBEX)
                {
                    Console.WriteLine($"DB Connect Error:{DBEX.Message}");
                }
                #endregion

                list.Clear();

                if (reachedOldestPage)
                {
                    break; // no older page exists; further iterations would have nothing valid to fetch
                }
            }
        }

        #region Result
        progressBar1.Value = 0;
        Console.Clear();
        Console.WriteLine("Done");
        Console.WriteLine("TotalData:" + dataGridView1.RowCount.ToString());
        Console.Beep();
        #endregion
    }
    finally
    {
        // Restore the window even if something unexpected was thrown above.
        this.WindowState = FormWindowState.Normal;
    }
}
/// <summary>
/// Crawls the index pages of a PTT board (newest page first), downloads every listed article together
/// with its push comments, appends a one-line summary per article to <c>log.text</c> and shows the
/// articles in <c>dataGridView1</c>.
/// </summary>
/// <param name="pages">
/// Number of index pages to crawl, following the "previous page" link after each one.
/// Values below 1 crawl nothing. Crawling stops early when a page has no previous-page link.
/// </param>
/// <param name="str_board">Board name as it appears in the URL path (<c>/bbs/{str_board}/</c>).</param>
/// <returns>A task that completes once all requested pages have been processed.</returns>
/// <remarks>
/// Download and parsing errors are caught per page / per article, written to the console and do not
/// abort the crawl. The form is minimized while the crawl runs and is always restored afterwards.
/// </remarks>
private async Task PTTWebCrawlerasync(int pages, string str_board)
{
    this.WindowState = FormWindowState.Minimized;
    Console.WriteLine("PTT Crawler is Running");

    string baseurl = "https://www.ptt.cc";
    string str_index = "index.html";
    string str_url = $"{baseurl}/bbs/{str_board}/{str_index}";

    progressBar1.Minimum = 0;
    progressBar1.Maximum = 100;
    progressBar1.Value = 0;

    // One handler/client for the whole crawl. The "over18" cookie is registered for the whole host so it
    // is sent with index pages and article pages alike.
    var handler = new HttpClientHandler { CookieContainer = new CookieContainer() };
    handler.CookieContainer.Add(new Cookie("over18", "1", "/", new Uri(baseurl).Host));

    try
    {
        // Awaits below deliberately keep the UI synchronization context: the code touches form controls.
        using (var httpClient = new HttpClient(handler))
        {
            var list = new List<WebPTT>();

            for (int i = 1; i <= pages; i++)
            {
                // Set only when a page was parsed successfully and has no older page to go to.
                bool reachedOldestPage = false;

                try
                {
                    string indexHtml = await httpClient.GetStringAsync(str_url);
                    var indexDocument = new HtmlDocument();
                    indexDocument.LoadHtml(indexHtml);

                    // The second anchor of the paging button group links to the previous (older) page.
                    // On the oldest page that anchor has no href, which yields an empty string here.
                    var pagingGroup = indexDocument.DocumentNode.Descendants("div")
                        .LastOrDefault(node => node.GetAttributeValue("class", "").Equals("btn-group btn-group-paging"));
                    var olderLink = pagingGroup?.Descendants("a").ElementAtOrDefault(1);
                    string pageUp = olderLink?.GetAttributeValue("href", "") ?? "";

                    var rows = indexDocument.DocumentNode.Descendants("div")
                        .Where(node => node.GetAttributeValue("class", "").Equals("r-ent"))
                        .ToList();

                    foreach (var row in rows)
                    {
                        // Expected descendant order: [0] popularity, [1] title, [2] meta (contains author).
                        var cells = row.Descendants("div").ToList();
                        if (cells.Count < 3)
                        {
                            continue; // malformed entry: skip it instead of losing the whole page
                        }

                        string title = cells[1].InnerText.Trim();
                        if (title.Contains("公告") || title.Contains("徵求") || title.Contains("板務") || title.Contains("問卷"))
                        {
                            continue; // announcements / requests / board affairs / surveys are not collected
                        }

                        WebPTT webptt = new WebPTT();

                        string popularity = cells[0].InnerText.Trim();
                        if (popularity.Contains("X"))
                        {
                            webptt.Popularity_ptt = popularity.Replace('X', '-');
                        }
                        else if (popularity == "")
                        {
                            webptt.Popularity_ptt = "0";
                        }
                        else
                        {
                            webptt.Popularity_ptt = popularity;
                        }

                        webptt.title_ptt = title;
                        webptt.author_ptt = cells[2].Descendants("div").FirstOrDefault()?.InnerText.Trim() ?? "";

                        // Entries without a link (no anchor or no href) keep a null URL and are not fetched.
                        string href = row.Descendants("a").FirstOrDefault()?.GetAttributeValue("href", "");
                        webptt.URL_ptt = string.IsNullOrEmpty(href) ? null : href;

                        if (!string.IsNullOrEmpty(webptt.URL_ptt))
                        {
                            try
                            {
                                string articleHtml = await httpClient.GetStringAsync(baseurl + webptt.URL_ptt);
                                var articleDocument = new HtmlDocument();
                                articleDocument.LoadHtml(articleHtml);

                                var contents = articleDocument.DocumentNode.Descendants("div")
                                    .Where(node => node.GetAttributeValue("class", "").Equals("bbs-screen bbs-content"))
                                    .ToList();
                                foreach (var content in contents)
                                {
                                    // Put a line break before the "看板" marker and, if that one was found,
                                    // before the "時間" marker as well. Text without the markers is kept as is.
                                    string text = content.InnerText;
                                    int boardIndex = text.IndexOf("看板", StringComparison.Ordinal);
                                    if (boardIndex >= 0)
                                    {
                                        text = text.Insert(boardIndex, "\n");
                                        int timeIndex = text.IndexOf("時間", StringComparison.Ordinal);
                                        if (timeIndex >= 0)
                                        {
                                            text = text.Insert(timeIndex, "\n");
                                        }
                                    }
                                    webptt.InnerText_ptt = text;
                                }

                                var pushNodes = articleDocument.DocumentNode.Descendants("div")
                                    .Where(node => node.GetAttributeValue("class", "").Equals("push"))
                                    .ToList();
                                var pushList = new List<WebPTT_Push>();
                                foreach (var push in pushNodes)
                                {
                                    // Expected span order: tag, user id, content, date/time.
                                    var spans = push.Descendants("span").ToList();
                                    if (spans.Count < 4)
                                    {
                                        Console.WriteLine("Get Push info Data Error:push entry has fewer than 4 spans");
                                        continue;
                                    }

                                    WebPTT_Push obpush = new WebPTT_Push();
                                    obpush.push_tag = spans[0].InnerText.Trim();
                                    obpush.user_id = spans[1].InnerText.Trim();
                                    obpush.context = spans[2].InnerText.Trim();
                                    obpush.datetime = spans[3].InnerText.Trim();
                                    pushList.Add(obpush);
                                }

                                // The article owns this list. It must not be cleared afterwards, otherwise
                                // the article would end up with an empty push collection.
                                webptt.ptt_Push_info = pushList;
                            }
                            catch (Exception x)
                            {
                                Console.WriteLine("Context :" + x.Message);
                            }
                        }

                        list.Add(webptt);
                    }

                    if (pageUp == "")
                    {
                        reachedOldestPage = true;
                    }
                    else
                    {
                        str_url = $"{baseurl}{pageUp}";
                        Console.WriteLine(str_url);
                    }
                }
                catch (Exception e)
                {
                    // str_url is left unchanged, so the next iteration retries the same page.
                    Console.WriteLine($"Crawling Error: {e.Message}");
                }

                progressBar1.Value = i * 100 / pages;

                // Log and display everything collected from this page.
                foreach (var token in list)
                {
                    File.AppendAllText("log.text", token.Popularity_ptt + " " + token.title_ptt + " " + token.author_ptt + " " + token.URL_ptt + "\n");
                    dataGridView1.Rows.Add(token.Popularity_ptt, token.title_ptt, token.author_ptt, baseurl + token.URL_ptt, token.InnerText_ptt);
                }
                list.Clear();

                if (reachedOldestPage)
                {
                    break; // no older page exists; further iterations would have nothing valid to fetch
                }
            }
        }

        #region Result
        progressBar1.Value = 0;
        Console.Clear();
        Console.WriteLine("Done");
        Console.WriteLine("TotalData:" + dataGridView1.RowCount.ToString());
        #endregion
    }
    finally
    {
        // Restore the window even if something unexpected was thrown above.
        this.WindowState = FormWindowState.Normal;
    }
}